# This workflow tests the enrollment of agents on the supported platforms,
# using the latest versions of fleet, fleetctl and orbit.
#
# It starts the latest release of fleet with the "fleetctl preview" command.
# It generates the installers for the latest version of Orbit with the
# "fleetctl package" command.
name: "Test Fleetctl, Orbit & Preview"

on:
  # Manual
  workflow_dispatch:
  # Nightly 2AM UTC
  schedule:
  - cron: "0 2 * * *"

# A later run in the same concurrency group cancels the one in progress.
# The group is keyed on the workflow name plus the head ref, falling back to
# the run id when no head ref is set.
concurrency:
  group: "${{ github.workflow }}-${{ github.head_ref || github.run_id}}"
  cancel-in-progress: true

defaults:
  run:
    # Naming bash explicitly gives fail-fast behaviour (bash -eo pipefail).
    # See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#exit-codes-and-error-action-preference
    shell: bash

permissions:
  contents: read

jobs:
  # Produces a unique tunnel subdomain and the matching public HTTPS address;
  # both are exported as job outputs.
  gen:
    runs-on: ubuntu-latest
    outputs:
      subdomain: ${{ steps.gen.outputs.subdomain }}
      address: ${{ steps.gen.outputs.address }}
    steps:
    - name: Harden Runner
      uses: step-security/harden-runner@63c24ba6bd7ba022e95695ff85de572c04a18142 # v2.7.0
      with:
        egress-policy: audit

    - id: gen
      run: |
        # A fresh UUID per run keeps the generated name unique.
        TUNNEL_NAME="fleet-test-$(uuidgen)"
        {
          echo "subdomain=${TUNNEL_NAME}"
          echo "address=https://${TUNNEL_NAME}.fleetuem.com"
        } >> "$GITHUB_OUTPUT"

  # Hosts the Fleet server under test: opens a Cloudflare tunnel for the
  # generated subdomain, starts Fleet with "fleetctl preview", then waits until
  # the expected number of hosts has enrolled. The tunnel is deleted and its
  # log uploaded regardless of the outcome.
  run-server:
    runs-on: ubuntu-latest
    needs: gen
    steps:
    - name: Harden Runner
      uses: step-security/harden-runner@63c24ba6bd7ba022e95695ff85de572c04a18142 # v2.7.0
      with:
        egress-policy: audit

    - name: Start tunnel
      # The readiness loop at the end of this script has no exit condition of
      # its own, so cap the step instead of spinning until the default job
      # timeout when the tunnel never connects.
      timeout-minutes: 5
      env:
        CERT_PEM: ${{ secrets.CLOUDFLARE_TUNNEL_FLEETUEM_CERT_B64 }}
      run: |
        # Increase maximum receive buffer size to roughly 2.5 MB.
        # Cloudflared uses quic-go. This buffer holds packets that have been received by the kernel,
        # but not yet read by the application (quic-go in this case). Once this buffer fills up, the
        # kernel will drop any new incoming packet.
        # See https://github.com/quic-go/quic-go/wiki/UDP-Receive-Buffer-Size.
        sudo sysctl -w net.core.rmem_max=2500000

        # Install cloudflared
        #
        # We pin to version 2025.5.0 because something broke with 2025.6.1.
        # 2025.6.1 fails with "failed to create tunnel: Unknown output format 'default'"
        wget https://github.com/cloudflare/cloudflared/releases/download/2025.5.0/cloudflared-linux-amd64.deb
        sudo dpkg -i cloudflared-linux-amd64.deb

        # Add secret
        echo "$CERT_PEM" | base64 -d > cert.pem
        # Start tunnel in the background, then poll its info until the first
        # connection is no longer pending a reconnect.
        cloudflared tunnel --origincert cert.pem --hostname ${{ needs.gen.outputs.subdomain }} --url http://localhost:1337 --name ${{ needs.gen.outputs.subdomain }} --logfile cloudflared.log &
        until [[ $(cloudflared tunnel --origincert cert.pem info -o json ${{ needs.gen.outputs.subdomain }} | jq '.conns[0].conns[0].is_pending_reconnect') = false ]]; do
          echo "Awaiting tunnel ready..."
          sleep 5
        done

    # Download fleet and fleetctl binaries from last successful build on main
    - name: Download binaries
      uses: dawidd6/action-download-artifact@5e780fc7bbd0cac69fc73271ed86edf5dcb72d67
      with:
        workflow: build-binaries.yaml
        branch: main
        name: build
        path: build
        check_artifacts: true

    - name: Run Fleet server
      # Bounds the enrollment wait loop at the end of the script.
      timeout-minutes: 15
      run: |
        chmod +x ./build/fleetctl
        ./build/fleetctl preview --no-hosts --disable-open-browser
        ./build/fleetctl config set --address ${{ needs.gen.outputs.address }}
        ./build/fleetctl get enroll-secret
        # Stream the Fleet container logs into the step output in the background.
        docker compose -f ~/.fleet/preview/docker-compose.yml logs --follow fleet01 fleet02 &

        # Ensure Fleet server is responding before waiting for enrollments
        echo "Checking Fleet server health..."
        HEALTH_CHECK_COUNT=0
        until curl -s -o /dev/null -w "%{http_code}" http://localhost:1337/healthz | grep -q "200"; do
          HEALTH_CHECK_COUNT=$((HEALTH_CHECK_COUNT + 1))
          if [ $HEALTH_CHECK_COUNT -ge 30 ]; then
            echo "ERROR: Fleet server not responding after 150 seconds"
            docker ps -a --filter "name=fleet"
            exit 1
          fi
          sleep 5
        done
        echo "Fleet server is responding"

        # Wait for all of the hosts to be enrolled.
        # NOTE(review): 3 is presumably one host each from the orbit-macos,
        # orbit-ubuntu and orbit-windows jobs -- confirm when adding platforms.
        EXPECTED=3
        ENROLLMENT_START=$(date +%s)
        # The current count is also written to the "hostcount" file so the
        # progress message below can print it.
        until [ $(./build/fleetctl get hosts --json | grep -v "No hosts found" | wc -l | tee hostcount) -ge $EXPECTED ]; do
          CURRENT_TIME=$(date +%s)
          ELAPSED=$((CURRENT_TIME - ENROLLMENT_START))
          echo -n "Waiting for hosts to enroll (${ELAPSED}s): "
          cat hostcount | xargs echo -n
          echo " / $EXPECTED"

          # Show diagnostic info every 60 seconds
          if [ $((ELAPSED % 60)) -lt 10 ]; then
            ./build/fleetctl get hosts --json || true
          fi

          sleep 10
        done
        echo "Success! $EXPECTED hosts enrolled."

    - name: Show enrolled hosts
      if: always()
      run: |
        ./build/fleetctl get hosts --json

    - name: Slack Notification
      if: failure()
      uses: slackapi/slack-github-action@e28cf165c92ffef168d23c5c9000cffc8a25e117 # v1.24.0
      with:
        payload: |
          {
            "text": "${{ job.status }}\n${{ github.event.pull_request.html_url || github.event.head.html_url }}",
            "blocks": [
              {
                "type": "section",
                "text": {
                  "type": "mrkdwn",
                  "text": "Integration test result: ${{ job.status }}\nhttps://github.com/fleetdm/fleet/actions/runs/${{  github.run_id }}\n${{ github.event.pull_request.html_url || github.event.head.html_url }}"
                }
              }
            ]
          }
      env:
        SLACK_WEBHOOK_URL: ${{ secrets.SLACK_G_HELP_ENGINEERING_WEBHOOK_URL }}
        SLACK_WEBHOOK_TYPE: INCOMING_WEBHOOK

    # Always delete the named tunnel created in "Start tunnel".
    - name: Cleanup tunnel
      if: always()
      run: cloudflared tunnel --origincert cert.pem delete --force ${{ needs.gen.outputs.subdomain }}

    - name: Upload cloudflared logs
      if: always()
      uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0
      with:
        name: cloudflared.log
        path: cloudflared.log

  # Logs in to the Fleet server once and exposes the resulting token as a job
  # output so that dependent jobs do not each have to log in.
  login:
    runs-on: ubuntu-latest
    needs: gen
    outputs:
      token: ${{ steps.login.outputs.token }}
    steps:
    - name: Harden Runner
      uses: step-security/harden-runner@63c24ba6bd7ba022e95695ff85de572c04a18142 # v2.7.0
      with:
        egress-policy: audit

    # Download fleet and fleetctl binaries from last successful build on main
    - name: Download binaries
      uses: dawidd6/action-download-artifact@5e780fc7bbd0cac69fc73271ed86edf5dcb72d67
      with:
        workflow: build-binaries.yaml
        branch: main
        name: build
        path: build
        check_artifacts: true

    # Login only here and share the token because otherwise we could hit rate limits.
    - id: login
      name: Attempt login
      # Must cover both retry loops below: up to ~5 minutes waiting for the
      # server plus up to ~2.5 minutes of login attempts. A shorter limit kills
      # the step before the script's own error handling can run.
      timeout-minutes: 10
      run: |
        chmod +x ./build/fleetctl
        ./build/fleetctl config set --address ${{ needs.gen.outputs.address }}

        # Wait for Fleet server to be reachable first
        echo "Waiting for Fleet server to be ready..."
        ATTEMPT=0
        until curl -s -o /dev/null -w "%{http_code}" ${{ needs.gen.outputs.address }}/healthz | grep -q "200"; do
          ATTEMPT=$((ATTEMPT + 1))
          if [ $ATTEMPT -ge 60 ]; then
            echo "ERROR: Fleet server not reachable after 5 minutes"
            exit 1
          fi
          echo "Waiting for server... attempt $ATTEMPT/60"
          sleep 5
        done
        echo "Fleet server is responding, attempting login..."

        ATTEMPT=0
        until ./build/fleetctl login --email admin@example.com --password preview1337#; do
          ATTEMPT=$((ATTEMPT + 1))
          if [ $ATTEMPT -ge 30 ]; then
            echo "ERROR: Failed to login after $ATTEMPT attempts"
            exit 1
          fi
          echo "Login attempt $ATTEMPT failed, retrying in 5s..."
          sleep 5
        done
        # Read the token written by "fleetctl login" from the fleetctl config.
        # With pipefail, a missing token line makes grep fail and aborts the step.
        TOKEN=$(grep token ~/.fleet/config | awk '{ print $2 }')
        echo "token=$TOKEN" >> $GITHUB_OUTPUT

  # Builds a .pkg installer with "fleetctl package", installs it on a macOS
  # runner and waits until the runner's hostname shows up in "fleetctl get hosts".
  orbit-macos:
    timeout-minutes: 10
    strategy:
      matrix:
        # To run multiple VMs that have the same UUID we need to implement
        # https://github.com/fleetdm/fleet/issues/8021 (otherwise orbit and osqueryd
        # in the same host are enrolled as two hosts in Fleet).
        # Until then we will just test the `stable` channel in all components.
        #
        # Alternatively, we can bring back the `edge` channel when we decide to upgrade
        # our worker to macOS 13 in the future, as they changed the virtualization
        # layer for 13 and now it has random UUIDs (https://github.com/actions/runner-images/issues/7591).
        orbit-channel: [ 'stable' ]
        osqueryd-channel: [ 'stable' ]
        desktop-channel: [ 'stable' ]
    runs-on: macos-latest
    needs: [gen, login]
    steps:
    - name: Harden Runner
      uses: step-security/harden-runner@63c24ba6bd7ba022e95695ff85de572c04a18142 # v2.7.0
      with:
        egress-policy: audit

    # The checkout provides the uninstall script used in the last step.
    - name: Checkout Code
      uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # v3.5.3

    - name: Install dependencies
      run: |
        npm install -g fleetctl
        fleetctl config set --address ${{ needs.gen.outputs.address }} --token ${{ needs.login.outputs.token }}

    - name: Wait until fleet address is reachable and fleet responds
      run: |
        # --fail makes curl exit non-zero on HTTP error responses, so the loop
        # keeps retrying until the server actually answers successfully.
        # The only bound on this loop is the job's timeout-minutes.
        until curl -v --fail ${{ needs.gen.outputs.address }}/version;
        do
          echo "Awaiting until fleet server responds..."
          sleep 10
        done

    - name: Install Orbit
      run: |
        # Give the host a recognizable name; the enrollment check below greps for it.
        sudo hostname macos-orbit-${{ matrix.orbit-channel }}-osqueryd-${{ matrix.osqueryd-channel }}
        SECRET_JSON=$(fleetctl get enroll_secret --json --debug)
        echo $SECRET_JSON
        SECRET=$(echo $SECRET_JSON | jq -r '.spec.secrets[0].secret')
        echo "Secret: $SECRET"
        echo "Hostname: $(hostname -s)"
        fleetctl package --type pkg --fleet-url=${{ needs.gen.outputs.address }} --enroll-secret=$SECRET --orbit-channel=${{ matrix.orbit-channel }} --osqueryd-channel=${{ matrix.osqueryd-channel }} --desktop-channel=${{ matrix.desktop-channel }} --fleet-desktop --debug
        sudo installer -pkg fleet-osquery.pkg -target /
        ENROLLMENT_START=$(date +%s)
        until fleetctl get hosts | grep -iF $(hostname -s);
        do
          CURRENT_TIME=$(date +%s)
          ELAPSED=$((CURRENT_TIME - ENROLLMENT_START))
          echo "Awaiting enrollment... (${ELAPSED}s)"
          sleep 10
        done

    - name: Collect orbit logs
      if: always()
      run: |
        mkdir orbit-logs
        sudo cp /var/log/orbit/* orbit-logs/

    - name: Upload Orbit logs
      if: always()
      uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0
      with:
        name: orbit-macos-${{ matrix.orbit-channel }}-${{ matrix.osqueryd-channel }}-${{ matrix.desktop-channel }}-logs
        path: |
          orbit-logs

    - name: Uninstall Orbit
      run: |
        sudo ./it-and-security/lib/macos/scripts/uninstall-fleetd-macos.sh

  # Builds fleetctl from the checked-out source, uses it to package a .deb
  # installer, installs it on an Ubuntu runner and waits until the runner's
  # hostname shows up in "fleetctl get hosts".
  orbit-ubuntu:
    timeout-minutes: 10
    strategy:
      matrix:
        # To run multiple VMs that have the same UUID we need to implement
        # https://github.com/fleetdm/fleet/issues/8021 (otherwise orbit and osqueryd
        # in the same host are enrolled as two hosts in Fleet).
        # Until then we will just test the `stable` channel in all components.
        orbit-channel: [ 'stable' ]
        osqueryd-channel: [ 'stable' ]
        desktop-channel: [ 'stable' ]
    runs-on: ubuntu-latest
    needs: [gen, login]
    steps:
    - name: Harden Runner
      uses: step-security/harden-runner@63c24ba6bd7ba022e95695ff85de572c04a18142 # v2.7.0
      with:
        egress-policy: audit

    - name: Install dependencies
      run: |
        npm install -g fleetctl
        fleetctl config set --address ${{ needs.gen.outputs.address }} --token ${{ needs.login.outputs.token }}

    - name: Checkout Code
      uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # v3.5.3

    - name: Install Go
      uses: actions/setup-go@93397bea11091df50f3d7e59dc26a7711a8bcfbe # v4.1.0
      with:
        go-version-file: 'go.mod'

    - name: Build Fleetctl
      run: make fleetctl

    - name: Wait until fleet address is reachable and fleet responds
      run: |
        # --fail makes curl exit non-zero on HTTP error responses, so the loop
        # keeps retrying until the server actually answers successfully.
        # The only bound on this loop is the job's timeout-minutes.
        until curl -v --fail ${{ needs.gen.outputs.address }}/version; do
          echo "Awaiting until fleet server responds..."
          sleep 10
        done

    - name: Install Orbit
      run: |
        # Give the host a recognizable name; the enrollment check below greps for it.
        sudo hostname ubuntu-orbit-${{ matrix.orbit-channel }}-osqueryd-${{ matrix.osqueryd-channel }}
        chmod +x ./build/fleetctl
        # The npm-installed fleetctl queries the server; the locally built
        # ./build/fleetctl generates the package.
        SECRET_JSON=$(fleetctl get enroll_secret --json --debug)
        echo $SECRET_JSON
        SECRET=$(echo $SECRET_JSON | jq -r '.spec.secrets[0].secret')
        echo "Secret: $SECRET"
        echo "Hostname: $(hostname -s)"
        ./build/fleetctl package --type deb --fleet-url=${{ needs.gen.outputs.address }} --enroll-secret=$SECRET  --orbit-channel=${{ matrix.orbit-channel }} --osqueryd-channel=${{ matrix.osqueryd-channel }} --desktop-channel=${{ matrix.desktop-channel }} --fleet-desktop --debug
        sudo dpkg -i fleet-osquery*
        ENROLLMENT_START=$(date +%s)
        until fleetctl get hosts | grep -iF $(hostname -s); do
          CURRENT_TIME=$(date +%s)
          ELAPSED=$((CURRENT_TIME - ENROLLMENT_START))
          echo "Awaiting enrollment... (${ELAPSED}s)"
          # Diagnostic only; a failing status must not abort the wait loop.
          sudo systemctl status orbit.service || true
          sleep 10
        done

    - name: Collect orbit logs
      if: always()
      run: |
        sudo journalctl -u orbit.service > orbit-logs

    - name: Upload Orbit logs
      if: always()
      uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0
      with:
        name: orbit-ubuntu-${{ matrix.orbit-channel }}-${{ matrix.osqueryd-channel }}-${{ matrix.desktop-channel }}-logs
        path: |
          orbit-logs

    - name: Uninstall Orbit
      run: |
        sudo apt remove fleet-osquery -y

  # Builds the Windows MSI installer on an Ubuntu runner with
  # "fleetctl package --type msi" and uploads it as an artifact.
  orbit-windows-build:
    timeout-minutes: 10
    strategy:
      matrix:
        # To run multiple VMs that have the same UUID we need to implement
        # https://github.com/fleetdm/fleet/issues/8021 (otherwise orbit and osqueryd
        # in the same host are enrolled as two hosts in Fleet).
        # Until then we will just test the `stable` channel in all components.
        orbit-channel: [ 'stable' ]
        osqueryd-channel: [ 'stable' ]
        desktop-channel: [ 'stable' ]
    runs-on: ubuntu-latest
    needs: [gen, login]
    steps:
    - name: Harden Runner
      uses: step-security/harden-runner@63c24ba6bd7ba022e95695ff85de572c04a18142 # v2.7.0
      with:
        egress-policy: audit

    - name: Install dependencies
      run: |
        # Start pulling the WiX image in the background while fleetctl installs.
        docker pull fleetdm/wix:latest &
        npm install -g fleetctl
        fleetctl config set --address ${{ needs.gen.outputs.address }} --token ${{ needs.login.outputs.token }}

    - name: Wait until fleet address is reachable and fleet responds
      run: |
        # --fail makes curl exit non-zero on HTTP error responses, so the loop
        # keeps retrying until the server actually answers successfully.
        # The only bound on this loop is the job's timeout-minutes.
        until curl -v --fail ${{ needs.gen.outputs.address }}/version;
        do
          echo "Awaiting until fleet server responds..."
          sleep 10
        done

    - name: Build Orbit
      run: |
        SECRET_JSON=$(fleetctl get enroll_secret --json --debug)
        echo $SECRET_JSON
        SECRET=$(echo $SECRET_JSON | jq -r '.spec.secrets[0].secret')
        echo "Secret: $SECRET"
        echo "Hostname: $(hostname -s)"
        fleetctl package --type msi --fleet-url=${{ needs.gen.outputs.address }} --enroll-secret=$SECRET --orbit-channel=${{ matrix.orbit-channel }} --osqueryd-channel=${{ matrix.osqueryd-channel }} --desktop-channel=${{ matrix.desktop-channel }} --fleet-desktop --debug
        # Rename so the artifact name encodes the channel combination.
        mv fleet-osquery.msi orbit-${{ matrix.orbit-channel }}-osqueryd-${{ matrix.osqueryd-channel }}-desktop-${{ matrix.desktop-channel }}.msi

    - name: Upload MSI
      uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0
      with:
        name: orbit-${{ matrix.orbit-channel }}-osqueryd-${{ matrix.osqueryd-channel }}-desktop-${{ matrix.desktop-channel }}.msi
        path: orbit-${{ matrix.orbit-channel }}-osqueryd-${{ matrix.osqueryd-channel }}-desktop-${{ matrix.desktop-channel }}.msi

  # Downloads the MSI artifact built by orbit-windows-build and installs it on
  # a Windows runner, then uploads the installer and Orbit logs.
  orbit-windows:
    timeout-minutes: 10
    strategy:
      matrix:
        # Only the `stable` channel is exercised for every component: running
        # several VMs that share a UUID requires
        # https://github.com/fleetdm/fleet/issues/8021 (without it, orbit and
        # osqueryd on the same host enroll as two hosts in Fleet).
        orbit-channel: ["stable"]
        osqueryd-channel: ["stable"]
        desktop-channel: ["stable"]
    runs-on: windows-latest
    needs:
    - gen
    - login
    - orbit-windows-build
    steps:
    - name: Harden Runner
      uses: step-security/harden-runner@63c24ba6bd7ba022e95695ff85de572c04a18142 # v2.7.0
      with:
        egress-policy: audit

    - name: Install dependencies
      shell: bash
      run: |
        npm install -g fleetctl
        fleetctl config set --address ${{ needs.gen.outputs.address }} --token ${{ needs.login.outputs.token }} --tls-skip-verify

    - id: download
      name: Download MSI
      uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
      with:
        name: "orbit-${{ matrix.orbit-channel }}-osqueryd-${{ matrix.osqueryd-channel }}-desktop-${{ matrix.desktop-channel }}.msi"

    - name: Install Orbit
      shell: cmd
      run: |
        msiexec /i ${{steps.download.outputs.download-path}}\orbit-${{ matrix.orbit-channel }}-osqueryd-${{ matrix.osqueryd-channel }}-desktop-${{ matrix.desktop-channel }}.msi /quiet /passive /lv log.txt
        sleep 120

    # Enrollment is not verified from this job: Windows runner hostnames tend
    # to overlap and cannot be controlled here. The job simply returns, and the
    # run-server job waits until the expected number of hosts has enrolled.

    - name: Upload orbit install log
      if: always()
      uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0
      with:
        name: msiexec-install-log
        path: log.txt

    - name: Upload Orbit logs
      if: always()
      uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0
      with:
        name: "orbit-windows-${{ matrix.orbit-channel }}-${{ matrix.osqueryd-channel }}-${{ matrix.desktop-channel }}-logs"
        path: 'C:\Windows\system32\config\systemprofile\AppData\Local\FleetDM\Orbit\Logs\orbit-osquery.log'