Add host enrollment cooldown period (#112)

This addresses an issue some users experienced in which performance
problems were encountered when hosts were "competing" for enrollment
using the same osquery host identifier. The issue is addressed by adding
a cooldown period for host enrollment, preventing the same (as judged by
osquery host identifier) host from enrolling more than once per minute.

When users end up in the problematic scenario, they will see quite a bit
of error logs due to this issue. For now that's probably a good thing as
users need to be aware of the lack of visibility. We can explore rate
limiting the logging if that becomes an issue for someone.

Fixes #102
This commit is contained in:
Zach Wasserman 2020-12-10 11:04:58 -08:00 committed by GitHub
parent 3ce7351049
commit beb7e8b965
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 121 additions and 41 deletions

1
go.mod
View file

@ -48,6 +48,7 @@ require (
github.com/patrickmn/sortutil v0.0.0-20120526081524-abeda66eb583
github.com/pelletier/go-toml v1.1.0 // indirect
github.com/pkg/errors v0.8.1
github.com/pressly/goose v2.6.0+incompatible
github.com/prometheus/client_golang v0.9.3-0.20190127221311-3c4408c8b829
github.com/russellhaering/gosaml2 v0.3.1
github.com/russellhaering/goxmldsig v0.0.0-20180430223755-7acd5e4a6ef7

2
go.sum
View file

@ -166,6 +166,8 @@ github.com/pkg/term v0.0.0-20190109203006-aa71e9d9e942 h1:A7GG7zcGjl3jqAqGPmcNjd
github.com/pkg/term v0.0.0-20190109203006-aa71e9d9e942/go.mod h1:eCbImbZ95eXtAUIbLAuAVnBnwf83mjf6QIVH8SHYwqQ=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/pressly/goose v2.6.0+incompatible h1:3f8zIQ8rfgP9tyI0Hmcs2YNAqUCL1c+diLe3iU8Qd/k=
github.com/pressly/goose v2.6.0+incompatible/go.mod h1:m+QHWCqxR3k8D9l7qfzuC/djtlfzxr34mozWDYEu1z8=
github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw=
github.com/prometheus/client_golang v0.9.3-0.20190127221311-3c4408c8b829 h1:D+CiwcpGTW6pL6bv6KI3KbyEyCKyS+1JWS2h8PNDnGA=
github.com/prometheus/client_golang v0.9.3-0.20190127221311-3c4408c8b829/go.mod h1:p2iRAGwDERtqlqzRXnrOVns+ignqQo//hLXqYxZYVNs=

View file

@ -229,14 +229,18 @@ func testListHostsStatus(t *testing.T, ds kolide.Datastore) {
func testEnrollHost(t *testing.T, ds kolide.Datastore) {
test.AddAllHostsLabel(t, ds)
var hosts []*kolide.Host
enrollSecretName := "default"
for _, tt := range enrollTests {
h, err := ds.EnrollHost(tt.uuid, tt.nodeKey, "default")
h, err := ds.EnrollHost(tt.uuid, tt.nodeKey, enrollSecretName)
require.Nil(t, err)
hosts = append(hosts, h)
assert.Equal(t, tt.uuid, h.OsqueryHostID)
assert.NotEmpty(t, h.NodeKey)
assert.Equal(t, tt.nodeKey, h.NodeKey)
assert.Equal(t, enrollSecretName, h.EnrollSecretName)
// This host should not be allowed to re-enroll immediately
_, err = ds.EnrollHost(tt.uuid, tt.nodeKey+"new", enrollSecretName+"new")
require.Error(t, err)
}
}

View file

@ -5,8 +5,9 @@ import (
"fmt"
"time"
"github.com/jmoiron/sqlx"
"github.com/cenkalti/backoff/v4"
"github.com/fleetdm/fleet/server/kolide"
"github.com/jmoiron/sqlx"
"github.com/pkg/errors"
)
@ -287,44 +288,74 @@ func (d *Datastore) EnrollHost(osqueryHostID, nodeKey, secretName string) (*koli
return nil, fmt.Errorf("missing osquery host identifier")
}
zeroTime := time.Unix(0, 0).Add(24 * time.Hour)
sqlInsert := `
INSERT INTO hosts (
detail_update_time,
label_update_time,
osquery_host_id,
seen_time,
node_key,
enroll_secret_name
) VALUES (?, ?, ?, ?, ?, ?)
ON DUPLICATE KEY UPDATE
node_key = VALUES(node_key)
`
var host kolide.Host
err := d.withRetryTxx(func(tx *sqlx.Tx) error {
zeroTime := time.Unix(0, 0).Add(24 * time.Hour)
var result sql.Result
result, err := d.db.Exec(sqlInsert, zeroTime, zeroTime, osqueryHostID, time.Now().UTC(), nodeKey, secretName)
var id int64
err := tx.Get(&host, `SELECT id, last_enroll_time FROM hosts WHERE osquery_host_id = ?`, osqueryHostID)
if err != nil {
// Create new host record
sqlInsert := `
INSERT INTO hosts (
detail_update_time,
label_update_time,
osquery_host_id,
seen_time,
node_key,
enroll_secret_name
) VALUES (?, ?, ?, ?, ?, ?)
`
result, err := tx.Exec(sqlInsert, zeroTime, zeroTime, osqueryHostID, time.Now().UTC(), nodeKey, secretName)
if err != nil {
return errors.Wrap(err, "insert host")
}
id, _ = result.LastInsertId()
} else {
// Prevent hosts from enrolling too often with the same identifier.
// Prior to adding this we saw many hosts (probably VMs) with the
// same identifier competing for enrollment and causing perf issues.
if time.Since(host.LastEnrollTime) < kolide.HostEnrollCooldown {
return backoff.Permanent(fmt.Errorf("host identified by %s enrolling too often", osqueryHostID))
}
id = int64(host.ID)
// Update existing host record
sqlUpdate := `
UPDATE hosts
SET node_key = ?,
enroll_secret_name = ?,
last_enroll_time = NOW()
WHERE osquery_host_id = ?
`
_, err := tx.Exec(sqlUpdate, nodeKey, secretName, osqueryHostID)
if err != nil {
return errors.Wrap(err, "update host")
}
}
sqlSelect := `
SELECT * FROM hosts WHERE id = ? LIMIT 1
`
err = tx.Get(&host, sqlSelect, id)
if err != nil {
return errors.Wrap(err, "getting the host to return")
}
_, err = tx.Exec(`INSERT IGNORE INTO label_membership (host_id, label_id) VALUES (?, (SELECT id FROM labels WHERE name = 'All Hosts' AND label_type = 1))`, id)
if err != nil {
return errors.Wrap(err, "insert new host into all hosts label")
}
return nil
})
if err != nil {
return nil, errors.Wrap(err, "inserting")
return nil, err
}
id, _ := result.LastInsertId()
sqlSelect := `
SELECT * FROM hosts WHERE id = ? LIMIT 1
`
host := &kolide.Host{}
err = d.db.Get(host, sqlSelect, id)
if err != nil {
return nil, errors.Wrap(err, "getting the host to return")
}
_, err = d.db.Exec(`INSERT IGNORE INTO label_membership (host_id, label_id) VALUES (?, (SELECT id FROM labels WHERE name = 'All Hosts' AND label_type = 1))`, id)
if err != nil {
return nil, errors.Wrap(err, "insert new host into all hosts label")
}
return host, nil
return &host, nil
}
func (d *Datastore) AuthenticateHost(nodeKey string) (*kolide.Host, error) {

View file

@ -16,7 +16,7 @@ func Up_20200504120000(tx *sql.Tx) error {
"ADD COLUMN `additional` JSON DEFAULT NULL;",
)
if err != nil {
errors.Wrap(err, "add additional column")
return errors.Wrap(err, "add additional column")
}
_, err = tx.Exec(
@ -24,7 +24,7 @@ func Up_20200504120000(tx *sql.Tx) error {
"ADD COLUMN `additional_queries` JSON DEFAULT NULL;",
)
if err != nil {
errors.Wrap(err, "add additional_queries column")
return errors.Wrap(err, "add additional_queries column")
}
return nil

View file

@ -0,0 +1,32 @@
package tables
import (
"database/sql"
"github.com/pkg/errors"
)
func init() {
MigrationClient.AddMigration(Up_20201208121729, Down_20201208121729)
}
func Up_20201208121729(tx *sql.Tx) error {
_, err := tx.Exec(
"ALTER TABLE `hosts` " +
"ADD COLUMN `last_enroll_time` TIMESTAMP DEFAULT CURRENT_TIMESTAMP;",
)
if err != nil {
return errors.Wrap(err, "add last_enroll_time column")
}
_, err = tx.Exec("UPDATE hosts SET last_enroll_time = created_at")
if err != nil {
return errors.Wrap(err, "set last_enroll_time")
}
return nil
}
func Down_20201208121729(tx *sql.Tx) error {
return nil
}

View file

@ -36,6 +36,12 @@ const (
// online interval to avoid flapping of hosts that check in a bit later
// than their expected checkin interval.
OnlineIntervalBuffer = 30
// HostEnrollCooldown is how often a host can enroll. Users sometimes deploy
// osquery in a configuration in which multiple hosts use the same host
// identifier and Fleet needs to have a cooldown period to prevent this from
// cascading into out of control host enrollment.
HostEnrollCooldown = 1 * time.Minute
)
type HostStore interface {
@ -45,6 +51,9 @@ type HostStore interface {
SaveHost(host *Host) error
DeleteHost(hid uint) error
Host(id uint) (*Host, error)
// EnrollHost will enroll a new host with the given identifier, setting the
// node key, and enroll secret used for the host. Implementations of this
// method should respect HostEnrollCooldown.
EnrollHost(osqueryHostId, nodeKey, secretName string) (*Host, error)
ListHosts(opt HostListOptions) ([]*Host, error)
// AuthenticateHost authenticates and returns host metadata by node key.
@ -100,6 +109,7 @@ type Host struct {
OsqueryHostID string `json:"-" db:"osquery_host_id"`
DetailUpdateTime time.Time `json:"detail_updated_at" db:"detail_update_time"` // Time that the host details were last updated
LabelUpdateTime time.Time `json:"label_updated_at" db:"label_update_time"` // Time that the host details were last updated
LastEnrollTime time.Time `json:"last_enrolled_at" db:"last_enroll_time"` // Time that the host last enrolled
SeenTime time.Time `json:"seen_time" db:"seen_time"` // Time that the host was last "seen"
NodeKey string `json:"-" db:"node_key"`
HostName string `json:"hostname" db:"host_name"` // there is a fulltext index on this field