Simplify DB test/upgrade tool (#27141)

This PR simplifies the `test/upgrade` tool the QA team uses to test DB
upgrades.

- Removes "online migration" approach because we currently don't support
it (so it removes nginx as dependency).
- Adds a workflow to run this manually on GitHub Actions (in case dev/QA
folks have issues with Docker on macOS, which is a common problem).
- Adds logging to the output to ease troubleshooting (previous versions were
too quiet, making it impossible to troubleshoot).
This commit is contained in:
Lucas Manuel Rodriguez 2025-03-14 17:07:41 -03:00 committed by GitHub
parent 990322321d
commit b30a008aac
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 167 additions and 163 deletions

44
.github/workflows/db-upgrade-test.yml vendored Normal file
View file

@ -0,0 +1,44 @@
# This workflow can be used to test DB upgrades between two Fleet versions.
# It is triggered manually and runs the Go test in test/upgrade with the two
# Docker tags given as inputs.
name: DB upgrade test

on:
  workflow_dispatch: # allow manual action
    inputs:
      from-version:
        description: "Docker tag of Fleet starting version, e.g. 'v4.64.2'"
        required: true
        type: string
      to-version:
        description: "Docker tag of Fleet version to upgrade to, e.g. 'rc-minor-fleet-v4.65.0'"
        required: true
        type: string

defaults:
  run:
    # fail-fast using bash -eo pipefail. See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#exit-codes-and-error-action-preference
    shell: bash

permissions:
  contents: read

jobs:
  run-db-upgrade-test:
    runs-on: ubuntu-latest
    steps:
      - name: Harden Runner
        uses: step-security/harden-runner@63c24ba6bd7ba022e95695ff85de572c04a18142 # v2.7.0
        with:
          egress-policy: audit

      - name: Checkout Code
        uses: actions/checkout@629c2de402a417ea7690ca6ce3f33229e27606a5 # v2

      - name: Install Go
        uses: actions/setup-go@93397bea11091df50f3d7e59dc26a7711a8bcfbe # v4.1.0
        with:
          go-version-file: 'go.mod'

      - name: Run upgrade test
        # Pass the user-supplied inputs through the environment instead of
        # interpolating them into the script text, so their values can never be
        # parsed as shell code.
        env:
          FLEET_VERSION_A: ${{ github.event.inputs.from-version }}
          FLEET_VERSION_B: ${{ github.event.inputs.to-version }}
        run: |
          cd test/upgrade
          go test -v

View file

@ -1,16 +1,15 @@
# Upgrade Tests
The tests located in `test/upgrade` are intended to test fleet upgrades with online migrations as proposed in [#6376](https://github.com/fleetdm/fleet/pull/6376).
To run the tests, you need to specify the from and to versions. For example
This tool can be used to test DB upgrades between two Fleet versions.
```
$ FLEET_VERSION_A=v4.16.0 FLEET_VERSION_B=v4.18.0 go test ./test/upgrade
To run the tests, you need to specify the "from" and "to" versions, for example:
```sh
FLEET_VERSION_A=v4.16.0 FLEET_VERSION_B=v4.18.0 go test ./test/upgrade
```
Ensure that Docker is installed with Compose V2.
To check if you have the correct version, run the following command
```
$ docker compose version
```sh
docker compose version
Docker Compose version v2.6.0
```

View file

@ -13,21 +13,8 @@ services:
redis:
image: redis:6
# reverse proxy and tls termination for fleet-a and fleet-b
fleet:
image: nginx
volumes:
# don't mount the config. These will be copied manually so that
# we can reload nginx without recreating containers and getting a new public port each time.
# - ./nginx/fleet-a.conf:/etc/nginx/conf.d/default.conf
- ./fleet.crt:/etc/nginx/fleet.crt
- ./fleet.key:/etc/nginx/fleet.key
ports:
- "443"
fleet-a:
&default-fleet
image: fleetdm/fleet:${FLEET_VERSION_A:-latest}
image: fleetdm/fleet:${FLEET_VERSION:-latest}
environment:
FLEET_MYSQL_ADDRESS: mysql:3306
FLEET_MYSQL_DATABASE: fleet
@ -35,33 +22,28 @@ services:
FLEET_MYSQL_PASSWORD: fleet
FLEET_REDIS_ADDRESS: redis:6379
FLEET_SERVER_ADDRESS: 0.0.0.0:8080
FLEET_SERVER_TLS: 'false'
FLEET_LOGGING_JSON: 'true'
FLEET_LICENSE_KEY: ${FLEET_LICENSE_KEY}
FLEET_SERVER_CERT: /fleet.crt
FLEET_SERVER_KEY: /fleet.key
FLEET_LOGGING_JSON: "true"
FLEET_OSQUERY_LABEL_UPDATE_INTERVAL: 1m
FLEET_VULNERABILITIES_CURRENT_INSTANCE_CHECKS: "yes"
FLEET_VULNERABILITIES_DATABASES_PATH: /fleet/vulndb
FLEET_VULNERABILITIES_PERIODICITY: 5m
FLEET_LOGGING_DEBUG: 'true'
# This can be configured for testing purposes but otherwise uses the
# typical default of provided.
FLEET_OSQUERY_HOST_IDENTIFIER: ${FLEET_OSQUERY_HOST_IDENTIFIER:-provided}
FLEET_LOGGING_DEBUG: "true"
volumes:
- ./fleet.crt:/fleet.crt
- ./fleet.key:/fleet.key
ports:
- "8080"
depends_on:
- mysql
- redis
# Uses a different version than fleet-a
fleet-b:
<<: *default-fleet
image: fleetdm/fleet:${FLEET_VERSION_B:-latest}
osquery:
image: "osquery/osquery:4.7.0-ubuntu20.04"
volumes:
- ./fleet.crt:/etc/osquery/fleet.crt
- ./osquery.flags:/etc/osquery/osquery.flags
environment:
ENROLL_SECRET: "${ENROLL_SECRET}"
ENROLL_SECRET: "${ENROLL_SECRET:-foobar}"
command: osqueryd --flagfile=/etc/osquery/osquery.flags

View file

@ -3,13 +3,14 @@ package upgrade
import (
"bytes"
"context"
"crypto/tls"
"errors"
"fmt"
"io"
"math/rand"
"net/http"
"os"
"os/exec"
"path/filepath"
"strconv"
"testing"
"time"
@ -18,17 +19,12 @@ import (
"github.com/docker/docker/api/types/container"
"github.com/docker/docker/api/types/filters"
"github.com/docker/docker/client"
"github.com/fleetdm/fleet/v4/pkg/fleethttp"
"github.com/fleetdm/fleet/v4/server/service"
_ "github.com/go-sql-driver/mysql"
"github.com/jmoiron/sqlx"
)
// Slots correspond to docker-compose fleet services, either fleet-a or fleet-b
const (
slotA = "a"
slotB = "b"
)
func init() {
rand.Seed(time.Now().Unix())
}
@ -45,6 +41,7 @@ type Fleet struct {
Token string
dockerClient client.ContainerAPIClient
t *testing.T
}
// NewFleet starts fleet and its dependencies with the specified version.
@ -60,15 +57,16 @@ func NewFleet(t *testing.T, version string) *Fleet {
f := &Fleet{
ProjectName: projectName,
FilePath: "docker-compose.yaml",
FilePath: "docker-compose.yml",
Version: version,
dockerClient: dockerClient,
t: t,
}
t.Cleanup(f.cleanup)
if err := f.Start(); err != nil {
t.Fatalf("start fleet: %v", err)
t.Fatalf("start fleet version A: %v", err)
}
return f
@ -76,15 +74,15 @@ func NewFleet(t *testing.T, version string) *Fleet {
func (f *Fleet) Start() error {
env := map[string]string{
"FLEET_VERSION_A": f.Version,
"FLEET_VERSION": f.Version,
}
_, err := f.execCompose(env, "pull", "--parallel")
_, err := f.execCompose(env, "pull")
if err != nil {
return err
}
// start mysql and wait until ready
_, err = f.execCompose(env, "up", "-d", "mysql")
_, err = f.execCompose(env, "up", "--remove-orphans", "-d", "mysql")
if err != nil {
return err
}
@ -92,38 +90,19 @@ func (f *Fleet) Start() error {
return err
}
// run the migrations using the fleet-a service
_, err = f.execCompose(env, "run", "-T", "fleet-a", "fleet", "prepare", "db", "--no-prompt")
// run the migrations using the fleet starting version
_, err = f.execCompose(env, "run", "-T", "fleet", "fleet", "prepare", "db", "--no-prompt")
if err != nil {
return err
}
// start fleet-a
_, err = f.execCompose(env, "up", "-d", "fleet-a", "fleet")
// start fleet
_, err = f.execCompose(env, "up", "--remove-orphans", "-d", "fleet", "fleet")
if err != nil {
return err
}
// copy the nginx conf and reload nginx without creating a new container
srcPath := filepath.Join("nginx", "fleet-a.conf")
_, err = f.execCompose(env, "cp", srcPath, "fleet:/etc/nginx/conf.d/default.conf")
if err != nil {
return err
}
// drop to one nginx worker process regardless of CPU count to ensure repointing to the correct
// Fleet container happens quickly
_, err = f.execCompose(env, "exec", "-T", "fleet", "sed", "-i", "s/auto/1/", "/etc/nginx/nginx.conf")
if err != nil {
return err
}
_, err = f.execCompose(env, "exec", "-T", "fleet", "nginx", "-s", "reload")
if err != nil {
return err
}
if err := f.waitFleet(slotA); err != nil {
if err := f.waitFleet(); err != nil {
return err
}
@ -136,7 +115,7 @@ func (f *Fleet) Start() error {
// Client returns a fleet client that uses the fleet API.
func (f *Fleet) Client() (*service.Client, error) {
port, err := f.getPublicPort("fleet", 443)
port, err := f.getPublicPort("fleet", 8080)
if err != nil {
return nil, fmt.Errorf("get fleet port: %v", err)
}
@ -168,6 +147,8 @@ func (f *Fleet) setupFleet() error {
}
func (f *Fleet) waitMYSQL() error {
f.t.Log("waiting for MySQL container to respond...")
// get the random mysql host port assigned by docker
port, err := f.getPublicPort("mysql", 3306)
if err != nil {
@ -175,9 +156,10 @@ func (f *Fleet) waitMYSQL() error {
}
dsn := fmt.Sprintf("fleet:fleet@tcp(localhost:%d)/fleet", port)
f.t.Logf("dsn: %s", dsn)
retryInterval := 5 * time.Second
timeout := 1 * time.Minute
timeout := 5 * time.Minute
ticker := time.NewTicker(retryInterval)
defer ticker.Stop()
@ -204,7 +186,7 @@ func (f *Fleet) getPublicPort(serviceName string, privatePort uint16) (uint16, e
// get the random fleet host port assigned by docker
argsName := filters.Arg("name", containerName)
containers, err := f.dockerClient.ContainerList(context.TODO(), container.ListOptions{Filters: filters.NewArgs(argsName), All: true})
containers, err := f.dockerClient.ContainerList(context.TODO(), container.ListOptions{Filters: filters.NewArgs(argsName)})
if err != nil {
return 0, err
}
@ -219,8 +201,10 @@ func (f *Fleet) getPublicPort(serviceName string, privatePort uint16) (uint16, e
return 0, errors.New("private port not found")
}
func (f *Fleet) waitFleet(slot string) error {
containerName := fmt.Sprintf("%s-fleet-%s-1", f.ProjectName, slot)
func (f *Fleet) waitFleet() error {
f.t.Logf("waiting for fleet %s to be healthy...", f.Version)
containerName := fmt.Sprintf("%s-fleet-1", f.ProjectName)
// get the random fleet host port assigned by docker
argsName := filters.Arg("name", containerName)
@ -232,15 +216,18 @@ func (f *Fleet) waitFleet(slot string) error {
return errors.New("no fleet container found")
}
port := containers[0].Ports[0].PublicPort
healthURL := fmt.Sprintf("http://localhost:%d/healthz", port)
healthURL := fmt.Sprintf("https://localhost:%d/healthz", port)
f.t.Logf("fleet URL: %s", healthURL)
retryStrategy := backoff.NewExponentialBackOff()
retryStrategy.MaxInterval = 1 * time.Second
//nolint:gosec // G107: Ok to trust docker here
client := fleethttp.NewClient(fleethttp.WithTLSClientConfig(&tls.Config{InsecureSkipVerify: true}))
if err := backoff.Retry(
func() error {
//nolint:gosec // G107: Ok to trust docker here
resp, err := http.Get(healthURL)
resp, err := client.Get(healthURL)
if err != nil {
return err
}
@ -253,6 +240,7 @@ func (f *Fleet) waitFleet(slot string) error {
); err != nil {
return fmt.Errorf("check health: %v", err)
}
f.t.Log("fleet is healthy")
return nil
}
@ -281,9 +269,12 @@ func (f *Fleet) execCompose(env map[string]string, args ...string) (string, erro
var stdout, stderr bytes.Buffer
cmd := exec.Command("docker", args...)
f.t.Log(cmd.String())
cmd.Env = e
cmd.Stdout = &stdout
cmd.Stderr = &stderr
wout := io.MultiWriter(&stdout, os.Stdout)
werr := io.MultiWriter(&stderr, os.Stderr)
cmd.Stdout = wout
cmd.Stderr = werr
err := cmd.Run()
if err != nil {
return "", fmt.Errorf("docker: %v %s", err, stderr.String())
@ -314,7 +305,7 @@ func (f *Fleet) StartHost() (string, error) {
env := map[string]string{
"ENROLL_SECRET": enrollSecret,
}
output, err := f.execCompose(env, "run", "-d", "-T", "osquery")
output, err := f.execCompose(env, "run", "--remove-orphans", "-d", "-T", "osquery")
if err != nil {
return "", err
}
@ -333,46 +324,78 @@ func (f *Fleet) StartHost() (string, error) {
}
// Upgrade upgrades fleet to a specified version.
func (f *Fleet) Upgrade(toVersion string) error {
func (f *Fleet) Upgrade(from, to string) error {
// stop fleet
env := map[string]string{
"FLEET_VERSION_B": toVersion,
"FLEET_VERSION": from,
}
// run migrations using fleet-b
serviceName := "fleet-b"
_, err := f.execCompose(env, "run", "-T", serviceName, "fleet", "prepare", "db", "--no-prompt")
if err != nil {
if _, err := f.execCompose(env, "rm", "-s", "-v", "fleet"); err != nil {
return fmt.Errorf("bring fleet down: %v", err)
}
// run migrations
env = map[string]string{
"FLEET_VERSION": to,
}
// we need to pull the new version
if _, err := f.execCompose(env, "pull"); err != nil {
return err
}
if _, err := f.execCompose(env, "run", "--remove-orphans", "-T", "fleet", "fleet", "prepare", "db", "--no-prompt"); err != nil {
return fmt.Errorf("run migrations: %v", err)
}
// start the service
_, err = f.execCompose(env, "up", "-d", serviceName)
if err != nil {
return fmt.Errorf("start fleet: %v", err)
// start the new version
if _, err := f.execCompose(env, "up", "--remove-orphans", "-d", "fleet", "fleet"); err != nil {
return fmt.Errorf("start fleet version B: %v", err)
}
f.Version = to
// wait until healthy
if err := f.waitFleet(slotB); err != nil {
if err := f.waitFleet(); err != nil {
return fmt.Errorf("wait for fleet to be healthy: %v", err)
}
// copy the nginx conf and reload nginx without creating a new container
srcPath := filepath.Join("nginx", "fleet-b.conf")
_, err = f.execCompose(env, "cp", srcPath, "fleet:/etc/nginx/conf.d/default.conf")
if err != nil {
return err
}
_, err = f.execCompose(env, "exec", "-T", "fleet", "nginx", "-s", "reload")
if err != nil {
return err
}
// even with only one worker process, graceful reload of nginx workers doesn't happen instantly,
// so we add a wait here to let workers swap so they're pointed at the upgraded Fleet server
time.Sleep(250 * time.Millisecond)
f.Version = toVersion
f.t.Log("upgraded successfully")
return nil
}
// enrollHost starts a new host via f.StartHost and blocks until the Fleet API
// returns that host with a non-empty software list.
//
// It returns the hostname reported by f.StartHost. An error is returned when
// the Fleet client cannot be created, when the host cannot be started, or when
// the retry loop gives up before the host reports any software.
func enrollHost(t *testing.T, f *Fleet) (string, error) {
	client, err := f.Client()
	if err != nil {
		return "", fmt.Errorf("creating fleet client: %w", err)
	}

	// enroll a host
	hostname, err := f.StartHost()
	if err != nil {
		// Name the step that actually failed; this used to be reported as a
		// client-creation error, which sent troubleshooting in the wrong direction.
		return "", fmt.Errorf("starting host: %w", err)
	}

	// wait until host is enrolled and software is listed
	retryStrategy := backoff.NewExponentialBackOff()
	retryStrategy.InitialInterval = 5 * time.Second
	retryStrategy.MaxInterval = 5 * time.Minute

	if err := backoff.Retry(func() error {
		host, err := client.HostByIdentifier(hostname)
		if err != nil {
			t.Logf("get host by identifier %s: %s", hostname, err)
			return err
		}
		// The host can be visible before it has reported any software, so keep
		// retrying until the list is populated.
		if len(host.Software) == 0 {
			t.Logf("software for %s not reported yet", hostname)
			return errors.New("no software reported yet")
		}
		return nil
	}, retryStrategy); err != nil {
		return "", fmt.Errorf("host enroll retry: %w", err)
	}

	return hostname, nil
}

View file

@ -1,11 +0,0 @@
# TLS-terminating reverse proxy in front of the fleet-a service.
server {
# accept HTTPS connections on port 443
listen 443 ssl;
server_name fleet;
# certificate and private key presented to clients
ssl_certificate fleet.crt;
ssl_certificate_key fleet.key;
location / {
# forward every request to fleet-a over plain HTTP
proxy_pass http://fleet-a:8080;
}
}

View file

@ -1,11 +0,0 @@
# TLS-terminating reverse proxy in front of the fleet-b service.
server {
# accept HTTPS connections on port 443
listen 443 ssl;
server_name fleet;
# certificate and private key presented to clients
ssl_certificate fleet.crt;
ssl_certificate_key fleet.key;
location / {
# forward every request to fleet-b over plain HTTP
proxy_pass http://fleet-b:8080;
}
}

View file

@ -1,7 +1,8 @@
--verbose=true
--verbose
--debug
--tls_dump
--tls_hostname=fleet
--tls_hostname=fleet:8080
--tls_server_certs=/etc/osquery/fleet.crt
--enroll_secret_env=ENROLL_SECRET

View file

@ -3,37 +3,10 @@ package upgrade
import (
"os"
"testing"
"time"
"github.com/stretchr/testify/require"
)
// enrollHost starts a host through f.StartHost and polls the Fleet API every
// five seconds, for up to five minutes, until that host is returned with at
// least one software entry. Any failure aborts the test via require.
func enrollHost(t *testing.T, f *Fleet) string {
	apiClient, err := f.Client()
	require.NoError(t, err)

	// enroll a host
	hostname, err := f.StartHost()
	require.NoError(t, err)

	// softwareReported is true once the host can be fetched and lists software.
	softwareReported := func() bool {
		h, lookupErr := apiClient.HostByIdentifier(hostname)
		if lookupErr != nil {
			t.Logf("get host: %v", lookupErr)
			return false
		}
		return len(h.Software) != 0
	}

	// wait until host is enrolled and software is listed
	require.Eventually(t, softwareReported, 5*time.Minute, 5*time.Second)

	return hostname
}
func TestUpgradeAToB(t *testing.T) {
versionA := os.Getenv("FLEET_VERSION_A")
if versionA == "" {
@ -47,11 +20,15 @@ func TestUpgradeAToB(t *testing.T) {
f := NewFleet(t, versionA)
enrollHost(t, f)
hostname, err := enrollHost(t, f)
require.NoError(t, err)
t.Logf("first host %s enrolled successfully", hostname)
err := f.Upgrade(versionB)
err = f.Upgrade(versionA, versionB)
require.NoError(t, err)
// enroll another host with the new version
enrollHost(t, f)
hostname, err = enrollHost(t, f)
require.NoError(t, err)
t.Logf("second host %s enrolled successfully", hostname)
}