fleet/.github/workflows/integration.yml
Victor Lyuboslavsky 1d9131a602
Improve integration workflow robustness with health checks and detailed enrollment logging. (#32348)
Fixes #32347

# Checklist for submitter

## Testing

- [x] QA'd all new/changed functionality manually


<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->

## Summary by CodeRabbit

- New Features
- Added health checks and elapsed-time logging during server startup and
host enrollment in the integration workflow.
- Bug Fixes
- Reduced flakiness by adding bounded login retries and server readiness
verification before proceeding.
- Tests
- Periodic diagnostics for host enrollment status to aid visibility
during runs.
- Chores
- Increased server startup timeout from 10 to 15 minutes in the
integration workflow.
  - Minor workflow formatting cleanups for consistency.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
2025-08-27 14:52:48 -05:00

483 lines
19 KiB
YAML

# This workflow tests enrolling of agents on the supported platforms,
# using the latest version of fleet, fleetctl and orbit.
#
# It starts the latest release of fleet with the "fleetctl preview" command.
# It generates the installers for the latest version of Orbit with the
# "fleetctl package" command.
name: Test Fleetctl, Orbit & Preview
on:
workflow_dispatch: # Manual
schedule:
- cron: '0 2 * * *' # Nightly 2AM UTC
# This allows a subsequently queued workflow run to interrupt previous runs
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id}}
cancel-in-progress: true
defaults:
run:
# fail-fast using bash -eo pipefail. See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#exit-codes-and-error-action-preference
shell: bash
permissions:
contents: read
jobs:
gen:
runs-on: ubuntu-latest
outputs:
subdomain: ${{ steps.gen.outputs.subdomain }}
address: ${{ steps.gen.outputs.address }}
steps:
- name: Harden Runner
uses: step-security/harden-runner@63c24ba6bd7ba022e95695ff85de572c04a18142 # v2.7.0
with:
egress-policy: audit
- id: gen
run: |
UUID=$(uuidgen)
echo "subdomain=fleet-test-$UUID" >> $GITHUB_OUTPUT
echo "address=https://fleet-test-$UUID.fleetuem.com" >> $GITHUB_OUTPUT
run-server:
runs-on: ubuntu-latest
needs: gen
steps:
- name: Harden Runner
uses: step-security/harden-runner@63c24ba6bd7ba022e95695ff85de572c04a18142 # v2.7.0
with:
egress-policy: audit
- name: Start tunnel
env:
CERT_PEM: ${{ secrets.CLOUDFLARE_TUNNEL_FLEETUEM_CERT_B64 }}
run: |
# Increase maximum receive buffer size to roughly 2.5 MB.
# Cloudflared uses quic-go. This buffer holds packets that have been received by the kernel,
# but not yet read by the application (quic-go in this case). Once this buffer fills up, the
# kernel will drop any new incoming packet.
# See https://github.com/quic-go/quic-go/wiki/UDP-Receive-Buffer-Size.
sudo sysctl -w net.core.rmem_max=2500000
# Install cloudflared
#
# We pin to version 2025.5.0 because something broke with 2025.6.1.
# 2025.6.1 fails with "failed to create tunnel: Unknown output format 'default'"
wget https://github.com/cloudflare/cloudflared/releases/download/2025.5.0/cloudflared-linux-amd64.deb
sudo dpkg -i cloudflared-linux-amd64.deb
# Add secret
echo "$CERT_PEM" | base64 -d > cert.pem
# Start tunnel
cloudflared tunnel --origincert cert.pem --hostname ${{ needs.gen.outputs.subdomain }} --url http://localhost:1337 --name ${{ needs.gen.outputs.subdomain }} --logfile cloudflared.log &
until [[ $(cloudflared tunnel --origincert cert.pem info -o json ${{ needs.gen.outputs.subdomain }} | jq '.conns[0].conns[0].is_pending_reconnect') = false ]]; do
echo "Awaiting tunnel ready..."
sleep 5
done
# Download fleet and fleetctl binaries from last successful build on main
- name: Download binaries
uses: dawidd6/action-download-artifact@5e780fc7bbd0cac69fc73271ed86edf5dcb72d67
with:
workflow: build-binaries.yaml
branch: main
name: build
path: build
check_artifacts: true
- name: Run Fleet server
timeout-minutes: 15
run: |
chmod +x ./build/fleetctl
./build/fleetctl preview --no-hosts --disable-open-browser
./build/fleetctl config set --address ${{ needs.gen.outputs.address }}
./build/fleetctl get enroll-secret
docker compose -f ~/.fleet/preview/docker-compose.yml logs --follow fleet01 fleet02 &
# Ensure Fleet server is responding before waiting for enrollments
echo "Checking Fleet server health..."
HEALTH_CHECK_COUNT=0
until curl -s -o /dev/null -w "%{http_code}" http://localhost:1337/healthz | grep -q "200"; do
HEALTH_CHECK_COUNT=$((HEALTH_CHECK_COUNT + 1))
if [ $HEALTH_CHECK_COUNT -ge 30 ]; then
echo "ERROR: Fleet server not responding after 150 seconds"
docker ps -a --filter "name=fleet"
exit 1
fi
sleep 5
done
echo "Fleet server is responding"
# Wait for all of the hosts to be enrolled
EXPECTED=3
ENROLLMENT_START=$(date +%s)
until [ $(./build/fleetctl get hosts --json | grep -v "No hosts found" | wc -l | tee hostcount) -ge $EXPECTED ]; do
CURRENT_TIME=$(date +%s)
ELAPSED=$((CURRENT_TIME - ENROLLMENT_START))
echo -n "Waiting for hosts to enroll (${ELAPSED}s): "
cat hostcount | xargs echo -n
echo " / $EXPECTED"
# Show diagnostic info every 60 seconds
if [ $((ELAPSED % 60)) -lt 10 ]; then
./build/fleetctl get hosts --json || true
fi
sleep 10
done
echo "Success! $EXPECTED hosts enrolled."
- name: Show enrolled hosts
if: always()
run: |
./build/fleetctl get hosts --json
- name: Slack Notification
if: failure()
uses: slackapi/slack-github-action@e28cf165c92ffef168d23c5c9000cffc8a25e117 # v1.24.0
with:
payload: |
{
"text": "${{ job.status }}\n${{ github.event.pull_request.html_url || github.event.head.html_url }}",
"blocks": [
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": "Integration test result: ${{ job.status }}\nhttps://github.com/fleetdm/fleet/actions/runs/${{ github.run_id }}\n${{ github.event.pull_request.html_url || github.event.head.html_url }}"
}
}
]
}
env:
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_G_HELP_ENGINEERING_WEBHOOK_URL }}
SLACK_WEBHOOK_TYPE: INCOMING_WEBHOOK
- name: Cleanup tunnel
if: always()
run: cloudflared tunnel --origincert cert.pem delete --force ${{ needs.gen.outputs.subdomain }}
- name: Upload cloudflared logs
if: always()
uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0
with:
name: cloudflared.log
path: cloudflared.log
login:
runs-on: ubuntu-latest
needs: gen
outputs:
token: ${{ steps.login.outputs.token }}
steps:
# Download fleet and fleetctl binaries from last successful build on main
- name: Harden Runner
uses: step-security/harden-runner@63c24ba6bd7ba022e95695ff85de572c04a18142 # v2.7.0
with:
egress-policy: audit
- name: Download binaries
uses: dawidd6/action-download-artifact@5e780fc7bbd0cac69fc73271ed86edf5dcb72d67
with:
workflow: build-binaries.yaml
branch: main
name: build
path: build
check_artifacts: true
# Login only here and share the token because otherwise we could hit rate limits.
- id: login
name: Attempt login
timeout-minutes: 5
run: |
chmod +x ./build/fleetctl
./build/fleetctl config set --address ${{ needs.gen.outputs.address }}
# Wait for Fleet server to be reachable first
echo "Waiting for Fleet server to be ready..."
ATTEMPT=0
until curl -s -o /dev/null -w "%{http_code}" ${{ needs.gen.outputs.address }}/healthz | grep -q "200"; do
ATTEMPT=$((ATTEMPT + 1))
if [ $ATTEMPT -ge 60 ]; then
echo "ERROR: Fleet server not reachable after 5 minutes"
exit 1
fi
echo "Waiting for server... attempt $ATTEMPT/60"
sleep 5
done
echo "Fleet server is responding, attempting login..."
ATTEMPT=0
until ./build/fleetctl login --email admin@example.com --password preview1337#; do
ATTEMPT=$((ATTEMPT + 1))
if [ $ATTEMPT -ge 30 ]; then
echo "ERROR: Failed to login after $ATTEMPT attempts"
exit 1
fi
echo "Login attempt $ATTEMPT failed, retrying in 5s..."
sleep 5
done
TOKEN=$(cat ~/.fleet/config| grep token | awk '{ print $2 }')
echo "token=$TOKEN" >> $GITHUB_OUTPUT
orbit-macos:
timeout-minutes: 10
strategy:
matrix:
# To run multiple VMs that have the same UUID we need to implement
# https://github.com/fleetdm/fleet/issues/8021 (otherwise orbit and osqueryd
# in the same host are enrolled as two hosts in Fleet).
# Until then we will just test the `stable` channel in all components.
#
# Alternatively, we can bring back the `edge` channel when we decide to upgrade
# our worker to macOS 13 in the future, as they changed the virtualization
# layer for 13 and now it has random UUIDs (https://github.com/actions/runner-images/issues/7591).
orbit-channel: [ 'stable' ]
osqueryd-channel: [ 'stable' ]
desktop-channel: [ 'stable' ]
runs-on: macos-latest
needs: [gen, login]
steps:
- name: Harden Runner
uses: step-security/harden-runner@63c24ba6bd7ba022e95695ff85de572c04a18142 # v2.7.0
with:
egress-policy: audit
- name: Checkout Code
uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # v3.5.3
- name: Install dependencies
run: |
npm install -g fleetctl
fleetctl config set --address ${{ needs.gen.outputs.address }} --token ${{ needs.login.outputs.token }}
- name: Wait until fleet address is reachable and fleet responds
run: |
until curl -v -fail ${{ needs.gen.outputs.address }}/version;
do
echo "Awaiting until fleet server responds..."
sleep 10
done
- name: Install Orbit
run: |
sudo hostname macos-orbit-${{ matrix.orbit-channel }}-osqueryd-${{ matrix.osqueryd-channel }}
SECRET_JSON=$(fleetctl get enroll_secret --json --debug)
echo $SECRET_JSON
SECRET=$(echo $SECRET_JSON | jq -r '.spec.secrets[0].secret')
echo "Secret: $SECRET"
echo "Hostname: $(hostname -s)"
fleetctl package --type pkg --fleet-url=${{ needs.gen.outputs.address }} --enroll-secret=$SECRET --orbit-channel=${{ matrix.orbit-channel }} --osqueryd-channel=${{ matrix.osqueryd-channel }} --desktop-channel=${{ matrix.desktop-channel }} --fleet-desktop --debug
sudo installer -pkg fleet-osquery.pkg -target /
ENROLLMENT_START=$(date +%s)
until fleetctl get hosts | grep -iF $(hostname -s);
do
CURRENT_TIME=$(date +%s)
ELAPSED=$((CURRENT_TIME - ENROLLMENT_START))
echo "Awaiting enrollment... (${ELAPSED}s)"
sleep 10
done
- name: Collect orbit logs
if: always()
run: |
mkdir orbit-logs
sudo cp /var/log/orbit/* orbit-logs/
- name: Upload Orbit logs
if: always()
uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0
with:
name: orbit-macos-${{ matrix.orbit-channel }}-${{ matrix.osqueryd-channel }}-${{ matrix.desktop-channel }}-logs
path: |
orbit-logs
- name: Uninstall Orbit
run: |
sudo ./it-and-security/lib/macos/scripts/uninstall-fleetd-macos.sh
orbit-ubuntu:
timeout-minutes: 10
strategy:
matrix:
# To run multiple VMs that have the same UUID we need to implement
# https://github.com/fleetdm/fleet/issues/8021 (otherwise orbit and osqueryd
# in the same host are enrolled as two hosts in Fleet).
# Until then we will just test the `stable` channel in all components.
orbit-channel: [ 'stable' ]
osqueryd-channel: [ 'stable' ]
desktop-channel: [ 'stable' ]
runs-on: ubuntu-latest
needs: [gen, login]
steps:
- name: Harden Runner
uses: step-security/harden-runner@63c24ba6bd7ba022e95695ff85de572c04a18142 # v2.7.0
with:
egress-policy: audit
- name: Install dependencies
run: |
npm install -g fleetctl
fleetctl config set --address ${{ needs.gen.outputs.address }} --token ${{ needs.login.outputs.token }}
- name: Checkout Code
uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # v3.5.3
- name: Install Go
uses: actions/setup-go@93397bea11091df50f3d7e59dc26a7711a8bcfbe # v4.1.0
with:
go-version-file: 'go.mod'
- name: Build Fleetctl
run: make fleetctl
- name: Wait until fleet address is reachable and fleet responds
run: |
until curl -v -fail ${{ needs.gen.outputs.address }}/version; do
echo "Awaiting until fleet server responds..."
sleep 10
done
- name: Install Orbit
run: |
sudo hostname ubuntu-orbit-${{ matrix.orbit-channel }}-osqueryd-${{ matrix.osqueryd-channel }}
chmod +x ./build/fleetctl
SECRET_JSON=$(fleetctl get enroll_secret --json --debug)
echo $SECRET_JSON
SECRET=$(echo $SECRET_JSON | jq -r '.spec.secrets[0].secret')
echo "Secret: $SECRET"
echo "Hostname: $(hostname -s)"
./build/fleetctl package --type deb --fleet-url=${{ needs.gen.outputs.address }} --enroll-secret=$SECRET --orbit-channel=${{ matrix.orbit-channel }} --osqueryd-channel=${{ matrix.osqueryd-channel }} --desktop-channel=${{ matrix.desktop-channel }} --fleet-desktop --debug
sudo dpkg -i fleet-osquery*
ENROLLMENT_START=$(date +%s)
until fleetctl get hosts | grep -iF $(hostname -s); do
CURRENT_TIME=$(date +%s)
ELAPSED=$((CURRENT_TIME - ENROLLMENT_START))
echo "Awaiting enrollment... (${ELAPSED}s)"
sudo systemctl status orbit.service || true
sleep 10
done
- name: Collect orbit logs
if: always()
run: |
sudo journalctl -u orbit.service > orbit-logs
- name: Upload Orbit logs
if: always()
uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0
with:
name: orbit-ubuntu-${{ matrix.orbit-channel }}-${{ matrix.osqueryd-channel }}-${{ matrix.desktop-channel }}-logs
path: |
orbit-logs
- name: Uninstall Orbit
run: |
sudo apt remove fleet-osquery -y
orbit-windows-build:
timeout-minutes: 10
strategy:
matrix:
# To run multiple VMs that have the same UUID we need to implement
# https://github.com/fleetdm/fleet/issues/8021 (otherwise orbit and osqueryd
# in the same host are enrolled as two hosts in Fleet).
# Until then we will just test the `stable` channel in all components.
orbit-channel: [ 'stable' ]
osqueryd-channel: [ 'stable' ]
desktop-channel: [ 'stable' ]
runs-on: ubuntu-latest
needs: [gen, login]
steps:
- name: Harden Runner
uses: step-security/harden-runner@63c24ba6bd7ba022e95695ff85de572c04a18142 # v2.7.0
with:
egress-policy: audit
- name: Install dependencies
run: |
docker pull fleetdm/wix:latest &
npm install -g fleetctl
fleetctl config set --address ${{ needs.gen.outputs.address }} --token ${{ needs.login.outputs.token }}
- name: Wait until fleet address is reachable and fleet responds
run: |
until curl -v -fail ${{ needs.gen.outputs.address }}/version;
do
echo "Awaiting until fleet server responds..."
sleep 10
done
- name: Build Orbit
run: |
SECRET_JSON=$(fleetctl get enroll_secret --json --debug)
echo $SECRET_JSON
SECRET=$(echo $SECRET_JSON | jq -r '.spec.secrets[0].secret')
echo "Secret: $SECRET"
echo "Hostname: $(hostname -s)"
fleetctl package --type msi --fleet-url=${{ needs.gen.outputs.address }} --enroll-secret=$SECRET --orbit-channel=${{ matrix.orbit-channel }} --osqueryd-channel=${{ matrix.osqueryd-channel }} --desktop-channel=${{ matrix.desktop-channel }} --fleet-desktop --debug
mv fleet-osquery.msi orbit-${{ matrix.orbit-channel }}-osqueryd-${{ matrix.osqueryd-channel }}-desktop-${{ matrix.desktop-channel }}.msi
- name: Upload MSI
uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0
with:
name: orbit-${{ matrix.orbit-channel }}-osqueryd-${{ matrix.osqueryd-channel }}-desktop-${{ matrix.desktop-channel }}.msi
path: orbit-${{ matrix.orbit-channel }}-osqueryd-${{ matrix.osqueryd-channel }}-desktop-${{ matrix.desktop-channel }}.msi
orbit-windows:
timeout-minutes: 10
strategy:
matrix:
# To run multiple VMs that have the same UUID we need to implement
# https://github.com/fleetdm/fleet/issues/8021 (otherwise orbit and osqueryd
# in the same host are enrolled as two hosts in Fleet).
# Until then we will just test the `stable` channel in all components.
orbit-channel: [ 'stable' ]
osqueryd-channel: [ 'stable' ]
desktop-channel: [ 'stable' ]
needs: [gen, login, orbit-windows-build]
runs-on: windows-latest
steps:
- name: Harden Runner
uses: step-security/harden-runner@63c24ba6bd7ba022e95695ff85de572c04a18142 # v2.7.0
with:
egress-policy: audit
- name: Install dependencies
shell: bash
run: |
npm install -g fleetctl
fleetctl config set --address ${{ needs.gen.outputs.address }} --token ${{ needs.login.outputs.token }} --tls-skip-verify
- name: Download MSI
id: download
uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
with:
name: orbit-${{ matrix.orbit-channel }}-osqueryd-${{ matrix.osqueryd-channel }}-desktop-${{ matrix.desktop-channel }}.msi
- name: Install Orbit
shell: cmd
run: |
msiexec /i ${{steps.download.outputs.download-path}}\orbit-${{ matrix.orbit-channel }}-osqueryd-${{ matrix.osqueryd-channel }}-desktop-${{ matrix.desktop-channel }}.msi /quiet /passive /lv log.txt
sleep 120
# We can't very accurately check the install on these Windows hosts since the hostnames tend to
# overlap and we can't control the hostnames. Instead we just return and have the run-server job
# wait until the expected number of hosts enroll.
- name: Upload orbit install log
if: always()
uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0
with:
name: msiexec-install-log
path: log.txt
- name: Upload Orbit logs
if: always()
uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0
with:
name: orbit-windows-${{ matrix.orbit-channel }}-${{ matrix.osqueryd-channel }}-${{ matrix.desktop-channel }}-logs
path: C:\Windows\system32\config\systemprofile\AppData\Local\FleetDM\Orbit\Logs\orbit-osquery.log