Improve integration workflow robustness with health checks and detailed enrollment logging. (#32348)

Fixes #32347

# Checklist for submitter

## Testing

- [x] QA'd all new/changed functionality manually


<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->

## Summary by CodeRabbit

- New Features
  - Added health checks and elapsed-time logging during server startup and
    host enrollment in the integration workflow.
- Bug Fixes
  - Reduced flakiness by adding bounded login retries and server readiness
    verification before proceeding.
- Tests
  - Periodic diagnostics for host enrollment status to aid visibility
    during runs.
- Chores
  - Increased server startup timeout from 10 to 15 minutes in the
    integration workflow.
  - Minor workflow formatting cleanups for consistency.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
This commit is contained in:
Victor Lyuboslavsky 2025-08-27 14:52:48 -05:00 committed by GitHub
parent c920007851
commit 1d9131a602
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -89,20 +89,43 @@ jobs:
check_artifacts: true
- name: Run Fleet server
# Workflow step: starts the Fleet preview deployment, waits for its health
# endpoint to answer, then blocks until the expected number of hosts enroll.
# NOTE(review): the two timeout-minutes lines below look like the removed/added
# pair of a diff whose +/- markers are not shown (the commit message describes
# a 10 -> 15 minute change) — confirm that only the 15-minute value is live.
timeout-minutes: 10
timeout-minutes: 15
run: |
chmod +x ./build/fleetctl
# Launch the preview deployment with the --no-hosts and --disable-open-browser
# flags, then point fleetctl at the address produced by the `gen` job.
./build/fleetctl preview --no-hosts --disable-open-browser
./build/fleetctl config set --address ${{ needs.gen.outputs.address }}
./build/fleetctl get enroll-secret
# Follow the fleet01/fleet02 container logs in the background (trailing `&`)
# so they keep appearing in the step output while the loops below run.
docker compose -f ~/.fleet/preview/docker-compose.yml logs --follow fleet01 fleet02 &
# Ensure Fleet server is responding before waiting for enrollments
echo "Checking Fleet server health..."
HEALTH_CHECK_COUNT=0
# curl prints only the HTTP status code (-w "%{http_code}", body discarded);
# the loop ends once that output contains "200".
until curl -s -o /dev/null -w "%{http_code}" http://localhost:1337/healthz | grep -q "200"; do
HEALTH_CHECK_COUNT=$((HEALTH_CHECK_COUNT + 1))
# Give up after 30 failed probes spaced 5 seconds apart, listing the fleet
# containers first to aid debugging.
if [ $HEALTH_CHECK_COUNT -ge 30 ]; then
echo "ERROR: Fleet server not responding after 150 seconds"
docker ps -a --filter "name=fleet"
exit 1
fi
sleep 5
done
echo "Fleet server is responding"
# Wait for all of the hosts to be enrolled
EXPECTED=3
ENROLLMENT_START=$(date +%s)
# The host tally is the number of output lines from `get hosts --json` that do
# not contain "No hosts found"; `tee` also saves that number to ./hostcount for
# the progress message below.
# NOTE(review): this presumes one output line per host — confirm against the
# fleetctl JSON output format.
# NOTE(review): the loop itself has no retry cap; presumably the step's
# timeout-minutes is what bounds it — confirm.
until [ $(./build/fleetctl get hosts --json | grep -v "No hosts found" | wc -l | tee hostcount) -ge $EXPECTED ]; do
# NOTE(review): the next echo and the "(${ELAPSED}s)" echo further down look
# like the old and new versions of the same progress line — confirm only one
# should remain.
echo -n "Waiting for hosts to enroll: "
CURRENT_TIME=$(date +%s)
ELAPSED=$((CURRENT_TIME - ENROLLMENT_START))
echo -n "Waiting for hosts to enroll (${ELAPSED}s): "
cat hostcount | xargs echo -n
echo " / $EXPECTED"
# NOTE(review): this unconditional dump appears to be the pre-change line that
# the throttled dump below replaces — confirm.
./build/fleetctl get hosts --json
# Show diagnostic info every 60 seconds
# (true whenever ELAPSED modulo 60 is below 10; with the 10-second sleep this
# fires roughly once a minute, including on the first pass)
if [ $((ELAPSED % 60)) -lt 10 ]; then
./build/fleetctl get hosts --json || true
fi
sleep 10
done
echo "Success! $EXPECTED hosts enrolled."
@ -172,9 +195,29 @@ jobs:
run: |
# Script for a step whose header is outside this view: configures fleetctl,
# waits for the server's /healthz endpoint, logs in with bounded retries, and
# extracts the session token from the fleetctl config file.
chmod +x ./build/fleetctl
./build/fleetctl config set --address ${{ needs.gen.outputs.address }}
# NOTE(review): the next three lines (an unbounded `until … do` login loop with
# no matching `done` of its own) look like the removed side of a diff whose +/-
# markers are not shown; the bounded login loop further down appears to be the
# replacement — confirm.
until ./build/fleetctl login --email admin@example.com --password preview1337#
do
echo "Retrying in 5s..."
# Wait for Fleet server to be reachable first
echo "Waiting for Fleet server to be ready..."
ATTEMPT=0
# curl prints only the HTTP status code; the loop ends once it contains "200".
until curl -s -o /dev/null -w "%{http_code}" ${{ needs.gen.outputs.address }}/healthz | grep -q "200"; do
ATTEMPT=$((ATTEMPT + 1))
# 60 failed probes at 5-second intervals is the "5 minutes" in the message.
if [ $ATTEMPT -ge 60 ]; then
echo "ERROR: Fleet server not reachable after 5 minutes"
exit 1
fi
echo "Waiting for server... attempt $ATTEMPT/60"
sleep 5
done
echo "Fleet server is responding, attempting login..."
# Reuse the counter for the login phase: at most 30 attempts, 5 seconds apart.
ATTEMPT=0
until ./build/fleetctl login --email admin@example.com --password preview1337#; do
ATTEMPT=$((ATTEMPT + 1))
if [ $ATTEMPT -ge 30 ]; then
echo "ERROR: Failed to login after $ATTEMPT attempts"
exit 1
fi
echo "Login attempt $ATTEMPT failed, retrying in 5s..."
sleep 5
done
# Take the second whitespace-separated field of the line(s) containing "token"
# in ~/.fleet/config.
# NOTE(review): assumes exactly one such line exists — confirm.
TOKEN=$(cat ~/.fleet/config| grep token | awk '{ print $2 }')
@ -229,9 +272,12 @@ jobs:
# Fragment of a step script (its header is outside this view): builds a .pkg
# installer with fleetctl, installs it, then polls until this machine's short
# hostname shows up in `fleetctl get hosts`.
echo "Hostname: $(hostname -s)"
# $SECRET is set outside this view; the channel values come from the job matrix.
fleetctl package --type pkg --fleet-url=${{ needs.gen.outputs.address }} --enroll-secret=$SECRET --orbit-channel=${{ matrix.orbit-channel }} --osqueryd-channel=${{ matrix.osqueryd-channel }} --desktop-channel=${{ matrix.desktop-channel }} --fleet-desktop --debug
sudo installer -pkg fleet-osquery.pkg -target /
ENROLLMENT_START=$(date +%s)
# grep -iF: case-insensitive, fixed-string match on the short hostname.
# NOTE(review): $(hostname -s) is unquoted, so it is subject to word splitting —
# consider quoting it.
# NOTE(review): the loop has no retry cap of its own; presumably a job or step
# timeout bounds it — confirm.
until fleetctl get hosts | grep -iF $(hostname -s);
do
# NOTE(review): the plain "Awaiting enrollment..." echo and the "(${ELAPSED}s)"
# echo below look like the old and new versions of the same message in a diff
# whose +/- markers are not shown — confirm only one should remain.
echo "Awaiting enrollment..."
CURRENT_TIME=$(date +%s)
ELAPSED=$((CURRENT_TIME - ENROLLMENT_START))
echo "Awaiting enrollment... (${ELAPSED}s)"
sleep 10
done
@ -290,8 +336,7 @@ jobs:
- name: Wait until fleet address is reachable and fleet responds
# Workflow step: polls the server's /version endpoint every 10 seconds until
# curl exits successfully.
run: |
# NOTE(review): the two-line `until …;` / `do` form and the one-line
# `until …; do` form below look like the old and new versions of the same loop
# opener in a diff whose +/- markers are not shown — confirm only one remains.
# NOTE(review): `-fail` has a single dash, so curl reads it as a cluster of
# short options (-f -a -i -l) rather than the long option `--fail`; it probably
# still fails on HTTP errors via -f, but confirm the intent and consider `--fail`.
# NOTE(review): the loop has no retry cap of its own — confirm that a timeout
# bounds it.
until curl -v -fail ${{ needs.gen.outputs.address }}/version;
do
until curl -v -fail ${{ needs.gen.outputs.address }}/version; do
echo "Awaiting until fleet server responds..."
sleep 10
done
@ -307,9 +352,11 @@ jobs:
# Fragment of a step script (its header is outside this view): builds a .deb
# package with fleetctl, installs it with dpkg, then polls until this machine's
# short hostname shows up in `fleetctl get hosts`.
echo "Hostname: $(hostname -s)"
# $SECRET is set outside this view; the channel values come from the job matrix.
./build/fleetctl package --type deb --fleet-url=${{ needs.gen.outputs.address }} --enroll-secret=$SECRET --orbit-channel=${{ matrix.orbit-channel }} --osqueryd-channel=${{ matrix.osqueryd-channel }} --desktop-channel=${{ matrix.desktop-channel }} --fleet-desktop --debug
sudo dpkg -i fleet-osquery*
# NOTE(review): the next three lines (two-line loop opener plus the plain
# "Awaiting enrollment..." echo) look like the removed side of a diff whose +/-
# markers are not shown; the one-line opener with elapsed-time logging below
# appears to be the replacement — confirm.
# NOTE(review): the package step above calls ./build/fleetctl while the loops
# call bare `fleetctl` — confirm both resolve to the intended binary.
until fleetctl get hosts | grep -iF $(hostname -s);
do
echo "Awaiting enrollment..."
ENROLLMENT_START=$(date +%s)
# grep -iF: case-insensitive, fixed-string match on the short hostname.
# NOTE(review): $(hostname -s) is unquoted — consider quoting it.
until fleetctl get hosts | grep -iF $(hostname -s); do
CURRENT_TIME=$(date +%s)
ELAPSED=$((CURRENT_TIME - ENROLLMENT_START))
echo "Awaiting enrollment... (${ELAPSED}s)"
# Print the orbit service state on every pass; `|| true` keeps a non-zero
# status from systemctl from aborting the wait.
sudo systemctl status orbit.service || true
sleep 10
done