From beb7e8b965c77862dde8d79eec637565aae97602 Mon Sep 17 00:00:00 2001 From: Zach Wasserman Date: Thu, 10 Dec 2020 11:04:58 -0800 Subject: [PATCH] Add host enrollment cooldown period (#112) This addresses an issue some users experienced in which performance problems were encountered when hosts were "competing" for enrollment using the same osquery host identifier. The issue is addressed by adding a cooldown period for host enrollment, preventing the same (as judged by osquery host identifier) host from enrolling more than once per minute. When users end up in the problematic scenario, they will see quite a bit of error logs due to this issue. For now that's probably a good thing as users need to be aware of the lack of visibility. We can explore rate limiting the logging if that becomes an issue for someone. Fixes #102 --- go.mod | 1 + go.sum | 2 + server/datastore/datastore_hosts_test.go | 12 ++- server/datastore/mysql/hosts.go | 101 ++++++++++++------ .../20200504120000_AddAdditionalToHosts.go | 4 +- ...1208121729_AddLastEnrollmentTimeToHosts.go | 32 ++++++ server/kolide/hosts.go | 10 ++ 7 files changed, 121 insertions(+), 41 deletions(-) create mode 100644 server/datastore/mysql/migrations/tables/20201208121729_AddLastEnrollmentTimeToHosts.go diff --git a/go.mod b/go.mod index a7aa4d27a3..a1f3545adf 100644 --- a/go.mod +++ b/go.mod @@ -48,6 +48,7 @@ require ( github.com/patrickmn/sortutil v0.0.0-20120526081524-abeda66eb583 github.com/pelletier/go-toml v1.1.0 // indirect github.com/pkg/errors v0.8.1 + github.com/pressly/goose v2.6.0+incompatible github.com/prometheus/client_golang v0.9.3-0.20190127221311-3c4408c8b829 github.com/russellhaering/gosaml2 v0.3.1 github.com/russellhaering/goxmldsig v0.0.0-20180430223755-7acd5e4a6ef7 diff --git a/go.sum b/go.sum index 2233aa4c4b..90a794f6e3 100644 --- a/go.sum +++ b/go.sum @@ -166,6 +166,8 @@ github.com/pkg/term v0.0.0-20190109203006-aa71e9d9e942 h1:A7GG7zcGjl3jqAqGPmcNjd github.com/pkg/term v0.0.0-20190109203006-aa71e9d9e942/go.mod h1:eCbImbZ95eXtAUIbLAuAVnBnwf83mjf6QIVH8SHYwqQ= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/pressly/goose v2.6.0+incompatible h1:3f8zIQ8rfgP9tyI0Hmcs2YNAqUCL1c+diLe3iU8Qd/k= +github.com/pressly/goose v2.6.0+incompatible/go.mod h1:m+QHWCqxR3k8D9l7qfzuC/djtlfzxr34mozWDYEu1z8= github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw= github.com/prometheus/client_golang v0.9.3-0.20190127221311-3c4408c8b829 h1:D+CiwcpGTW6pL6bv6KI3KbyEyCKyS+1JWS2h8PNDnGA= github.com/prometheus/client_golang v0.9.3-0.20190127221311-3c4408c8b829/go.mod h1:p2iRAGwDERtqlqzRXnrOVns+ignqQo//hLXqYxZYVNs= diff --git a/server/datastore/datastore_hosts_test.go b/server/datastore/datastore_hosts_test.go index 44517b59f8..2a56063cff 100644 --- a/server/datastore/datastore_hosts_test.go +++ b/server/datastore/datastore_hosts_test.go @@ -229,14 +229,18 @@ func testListHostsStatus(t *testing.T, ds kolide.Datastore) { func testEnrollHost(t *testing.T, ds kolide.Datastore) { test.AddAllHostsLabel(t, ds) - var hosts []*kolide.Host + enrollSecretName := "default" for _, tt := range enrollTests { - h, err := ds.EnrollHost(tt.uuid, tt.nodeKey, "default") + h, err := ds.EnrollHost(tt.uuid, tt.nodeKey, enrollSecretName) require.Nil(t, err) - hosts = append(hosts, h) assert.Equal(t, tt.uuid, h.OsqueryHostID) - assert.NotEmpty(t, h.NodeKey) + assert.Equal(t, tt.nodeKey, h.NodeKey) + assert.Equal(t, enrollSecretName, h.EnrollSecretName) + + // This host should not be allowed to re-enroll immediately + _, err = ds.EnrollHost(tt.uuid, tt.nodeKey+"new", enrollSecretName+"new") + require.Error(t, err) } } diff --git a/server/datastore/mysql/hosts.go b/server/datastore/mysql/hosts.go index 8b32a022fc..f45bd87460 100644 --- a/server/datastore/mysql/hosts.go +++ b/server/datastore/mysql/hosts.go @@ -5,8 +5,9 @@ import ( "fmt" "time" - "github.com/jmoiron/sqlx" + "github.com/cenkalti/backoff/v4" "github.com/fleetdm/fleet/server/kolide" + "github.com/jmoiron/sqlx" "github.com/pkg/errors" ) @@ -287,44 +288,74 @@ func (d *Datastore) EnrollHost(osqueryHostID, nodeKey, secretName string) (*koli return nil, fmt.Errorf("missing osquery host identifier") } - zeroTime := time.Unix(0, 0).Add(24 * time.Hour) - sqlInsert := ` - INSERT INTO hosts ( - detail_update_time, - label_update_time, - osquery_host_id, - seen_time, - node_key, - enroll_secret_name - ) VALUES (?, ?, ?, ?, ?, ?) - ON DUPLICATE KEY UPDATE - node_key = VALUES(node_key) - ` + var host kolide.Host + err := d.withRetryTxx(func(tx *sqlx.Tx) error { + zeroTime := time.Unix(0, 0).Add(24 * time.Hour) - var result sql.Result - result, err := d.db.Exec(sqlInsert, zeroTime, zeroTime, osqueryHostID, time.Now().UTC(), nodeKey, secretName) + var id int64 + err := tx.Get(&host, `SELECT id, last_enroll_time FROM hosts WHERE osquery_host_id = ?`, osqueryHostID) + if err != nil { + // Create new host record + sqlInsert := ` + INSERT INTO hosts ( + detail_update_time, + label_update_time, + osquery_host_id, + seen_time, + node_key, + enroll_secret_name + ) VALUES (?, ?, ?, ?, ?, ?) + ` + result, err := tx.Exec(sqlInsert, zeroTime, zeroTime, osqueryHostID, time.Now().UTC(), nodeKey, secretName) + + if err != nil { + return errors.Wrap(err, "insert host") + } + + id, _ = result.LastInsertId() + } else { + // Prevent hosts from enrolling too often with the same identifier. + // Prior to adding this we saw many hosts (probably VMs) with the + // same identifier competing for enrollment and causing perf issues. + if time.Since(host.LastEnrollTime) < kolide.HostEnrollCooldown { + return backoff.Permanent(fmt.Errorf("host identified by %s enrolling too often", osqueryHostID)) + } + id = int64(host.ID) + // Update existing host record + sqlUpdate := ` + UPDATE hosts + SET node_key = ?, + enroll_secret_name = ?, + last_enroll_time = NOW() + WHERE osquery_host_id = ? + ` + _, err := tx.Exec(sqlUpdate, nodeKey, secretName, osqueryHostID) + + if err != nil { + return errors.Wrap(err, "update host") + } + } + + sqlSelect := ` + SELECT * FROM hosts WHERE id = ? LIMIT 1 + ` + err = tx.Get(&host, sqlSelect, id) + if err != nil { + return errors.Wrap(err, "getting the host to return") + } + + _, err = tx.Exec(`INSERT IGNORE INTO label_membership (host_id, label_id) VALUES (?, (SELECT id FROM labels WHERE name = 'All Hosts' AND label_type = 1))`, id) + if err != nil { + return errors.Wrap(err, "insert new host into all hosts label") + } + + return nil + }) if err != nil { - return nil, errors.Wrap(err, "inserting") + return nil, err } - - id, _ := result.LastInsertId() - sqlSelect := ` - SELECT * FROM hosts WHERE id = ? LIMIT 1 - ` - host := &kolide.Host{} - err = d.db.Get(host, sqlSelect, id) - if err != nil { - return nil, errors.Wrap(err, "getting the host to return") - } - - _, err = d.db.Exec(`INSERT IGNORE INTO label_membership (host_id, label_id) VALUES (?, (SELECT id FROM labels WHERE name = 'All Hosts' AND label_type = 1))`, id) - if err != nil { - return nil, errors.Wrap(err, "insert new host into all hosts label") - } - - return host, nil - + return &host, nil } func (d *Datastore) AuthenticateHost(nodeKey string) (*kolide.Host, error) { diff --git a/server/datastore/mysql/migrations/tables/20200504120000_AddAdditionalToHosts.go b/server/datastore/mysql/migrations/tables/20200504120000_AddAdditionalToHosts.go index d6269f18c6..d3c307382a 100644 --- a/server/datastore/mysql/migrations/tables/20200504120000_AddAdditionalToHosts.go +++ b/server/datastore/mysql/migrations/tables/20200504120000_AddAdditionalToHosts.go @@ -16,7 +16,7 @@ func Up_20200504120000(tx *sql.Tx) error { "ADD COLUMN `additional` JSON DEFAULT NULL;", ) if err != nil { - errors.Wrap(err, "add additional column") + return errors.Wrap(err, "add additional column") } _, err = tx.Exec( @@ -24,7 +24,7 @@ func Up_20200504120000(tx *sql.Tx) error { "ADD COLUMN `additional_queries` JSON DEFAULT NULL;", ) if err != nil { - errors.Wrap(err, "add additional_queries column") + return errors.Wrap(err, "add additional_queries column") } return nil diff --git a/server/datastore/mysql/migrations/tables/20201208121729_AddLastEnrollmentTimeToHosts.go b/server/datastore/mysql/migrations/tables/20201208121729_AddLastEnrollmentTimeToHosts.go new file mode 100644 index 0000000000..42a2d88190 --- /dev/null +++ b/server/datastore/mysql/migrations/tables/20201208121729_AddLastEnrollmentTimeToHosts.go @@ -0,0 +1,32 @@ +package tables + +import ( + "database/sql" + + "github.com/pkg/errors" +) + +func init() { + MigrationClient.AddMigration(Up_20201208121729, Down_20201208121729) +} + +func Up_20201208121729(tx *sql.Tx) error { + _, err := tx.Exec( + "ALTER TABLE `hosts` " + + "ADD COLUMN `last_enroll_time` TIMESTAMP DEFAULT CURRENT_TIMESTAMP;", + ) + if err != nil { + return errors.Wrap(err, "add last_enroll_time column") + } + + _, err = tx.Exec("UPDATE hosts SET last_enroll_time = created_at") + if err != nil { + return errors.Wrap(err, "set last_enroll_time") + } + + return nil +} + +func Down_20201208121729(tx *sql.Tx) error { + return nil +} diff --git a/server/kolide/hosts.go b/server/kolide/hosts.go index 30fd2e2889..e393835649 100644 --- a/server/kolide/hosts.go +++ b/server/kolide/hosts.go @@ -36,6 +36,12 @@ const ( // online interval to avoid flapping of hosts that check in a bit later // than their expected checkin interval. OnlineIntervalBuffer = 30 + + // HostEnrollCooldown is how often a host can enroll. Users sometimes deploy + // osquery in a configuration in which multiple hosts use the same host + // identifier and Fleet needs to have a cooldown period to prevent this from + // cascading into out of control host enrollment. + HostEnrollCooldown = 1 * time.Minute ) type HostStore interface { @@ -45,6 +51,9 @@ type HostStore interface { SaveHost(host *Host) error DeleteHost(hid uint) error Host(id uint) (*Host, error) + // EnrollHost will enroll a new host with the given identifier, setting the + // node key, and enroll secret used for the host. Implementations of this + // method should respect HostEnrollCooldown. EnrollHost(osqueryHostId, nodeKey, secretName string) (*Host, error) ListHosts(opt HostListOptions) ([]*Host, error) // AuthenticateHost authenticates and returns host metadata by node key. @@ -100,6 +109,7 @@ type Host struct { OsqueryHostID string `json:"-" db:"osquery_host_id"` DetailUpdateTime time.Time `json:"detail_updated_at" db:"detail_update_time"` // Time that the host details were last updated LabelUpdateTime time.Time `json:"label_updated_at" db:"label_update_time"` // Time that the host details were last updated + LastEnrollTime time.Time `json:"last_enrolled_at" db:"last_enroll_time"` // Time that the host last enrolled SeenTime time.Time `json:"seen_time" db:"seen_time"` // Time that the host was last "seen" NodeKey string `json:"-" db:"node_key"` HostName string `json:"hostname" db:"host_name"` // there is a fulltext index on this field