fleet/tools/fleet-mcp/fleet_integration.go

package main

import (
	"bytes"
	"context"
	"crypto/rand"
	"crypto/tls"
	"crypto/x509"
	"encoding/hex"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"os"
	"sort"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/sirupsen/logrus"
)

// teamFanOutConcurrency caps the number of in-flight per-team API calls during
// GetQueries / GetPolicies. Sequential per-team calls scale O(N teams) and
// dominate the latency on enterprise Fleets with 50+ teams; bounded
// concurrency lets us overlap the round trips without overwhelming Fleet with
// thousands of simultaneous requests. Both functions enforce the cap with a
// buffered channel of this size used as a semaphore.
const teamFanOutConcurrency = 8

// tempQueryNamePrefix is the name prefix shared by all transient saved queries
// created by runMultiHostQuery. The startup sweep of leftover temp queries
// finds them by this prefix.
const tempQueryNamePrefix = "fleet-mcp-temp-"

// randomHexSuffix produces the random part of a temp-query name: nBytes bytes
// read from crypto/rand and rendered as 2*nBytes lowercase hex characters.
// If the random source reports an error, the current Unix time in nanoseconds
// (formatted in base 16) is returned instead, so callers always receive a
// usable suffix.
func randomHexSuffix(nBytes int) string {
	buf := make([]byte, nBytes)
	_, err := rand.Read(buf)
	if err == nil {
		return hex.EncodeToString(buf)
	}
	// Fallback path: not random, but still distinct between calls made at
	// different instants.
	return strconv.FormatInt(time.Now().UnixNano(), 16)
}

// FleetClient is a client for interacting with the Fleet API. Build it with
// NewFleetClient, which populates every field:
//
//   - baseURL: the Fleet server URL exactly as passed to NewFleetClient.
//   - apiKey: the Fleet API token exactly as passed to NewFleetClient.
//   - httpClient: the HTTP client carrying the TLS settings chosen in
//     NewFleetClient and a 30-second overall request timeout.
type FleetClient struct {
	baseURL    string
	apiKey     string
	httpClient *http.Client
}

// PlatformBreakdown holds per-platform host counts. As populated by
// GetEndpointsWithAggregations from Fleet's host_summary response: MacOS
// counts "darwin" hosts; Linux sums "linux", "ubuntu", "centos", "rhel",
// "debian", "fedora" and "amzn"; ChromeOS counts "chrome"; Other collects
// every platform string not listed above; Total is the server-reported total
// host count rather than a sum of the other fields.
type PlatformBreakdown struct {
	MacOS    int `json:"macos"`
	Windows  int `json:"windows"`
	Linux    int `json:"linux"`
	ChromeOS int `json:"chromeos"`
	IOS      int `json:"ios"`
	IPadOS   int `json:"ipados"`
	Android  int `json:"android"`
	Other    int `json:"other"`
	Total    int `json:"total"`
}

// PolicyCompliance represents policy compliance data for a single policy,
// identified by PolicyID and PolicyName.
//
// NOTE(review): Total, Compliant and NonCompliant are presumably host counts
// with Total = Compliant + NonCompliant. The code that fills this struct is
// not in this part of the file — confirm against the producer.
type PolicyCompliance struct {
	PolicyID     string `json:"policy_id"`
	PolicyName   string `json:"policy_name"`
	Total        int    `json:"total"`
	Compliant    int    `json:"compliant"`
	NonCompliant int    `json:"non_compliant"`
}

// VulnerabilityImpact represents vulnerability impact data for one CVE
// (CVEID): how many systems exist in total and how many of them are impacted.
type VulnerabilityImpact struct {
	CVEID           string `json:"cve_id"`
	TotalSystems    int    `json:"total_systems"`
	ImpactedSystems int    `json:"impacted_systems"`
	// Truncated is true when ImpactedSystems is a lower bound — at least
	// one per-version-id fan-out hit fetchHostsHardCap, so the actual
	// impact may be larger. Operators should tighten filters or raise the
	// cap to get an exact count. The field is omitted from JSON when false.
	Truncated bool `json:"truncated,omitempty"`
}

// AggregateResponse represents a consistent response format for aggregations:
// a headline Count plus an aggregation-specific Data payload. For example,
// GetEndpointsWithAggregations sets Count to the total host count and Data to
// a map with the keys "platform_breakdown" and "total_count".
type AggregateResponse struct {
	Count int         `json:"count"`
	Data  interface{} `json:"data"`
}

// NewFleetClient builds a FleetClient for the Fleet server at baseURL that
// authenticates with apiKey.
//
// TLS behavior is selected by the last two arguments, which are mutually
// exclusive:
//   - tlsSkipVerify turns off certificate verification. It is unsafe and is
//     accepted only when baseURL is a loopback URL (see isLoopbackURL).
//   - caFile, when non-empty, is the path of a PEM-encoded CA certificate
//     that becomes the trusted root pool (for self-signed certs).
//
// Any invalid combination or unreadable/unparseable CA file is reported with
// logrus.Fatalf. The returned client uses a 30-second request timeout.
func NewFleetClient(baseURL, apiKey string, tlsSkipVerify bool, caFile string) *FleetClient {
	tlsCfg := &tls.Config{}
	hasCAFile := caFile != ""

	if tlsSkipVerify && hasCAFile {
		logrus.Fatalf("conflicting TLS settings: tlsSkipVerify and caFile are mutually exclusive — use one or the other, not both")
	}

	switch {
	case tlsSkipVerify:
		// Hard gate: skip-verify against a remote URL would let an on-path
		// attacker present any certificate and capture the admin Fleet API
		// token in a single handshake, so only loopback targets are allowed.
		if !isLoopbackURL(baseURL) {
			logrus.Fatalf("FLEET_TLS_SKIP_VERIFY=true is only allowed when FLEET_BASE_URL points at localhost (got %s); refuse to start. Remove FLEET_TLS_SKIP_VERIFY or set FLEET_CA_FILE to a trusted PEM instead.", baseURL)
		}
		logrus.Warn("TLS certificate verification is disabled — localhost only; do not use in production")
		tlsCfg.InsecureSkipVerify = true //nolint:gosec

	case hasCAFile:
		pemBytes, readErr := os.ReadFile(caFile)
		if readErr != nil {
			logrus.Fatalf("failed to read CA certificate file %s: %v", caFile, readErr)
		}
		roots := x509.NewCertPool()
		if ok := roots.AppendCertsFromPEM(pemBytes); !ok {
			logrus.Fatalf("failed to parse CA certificate from %s", caFile)
		}
		tlsCfg.RootCAs = roots
		logrus.Infof("loaded custom CA certificate from %s", caFile)
	}

	client := &http.Client{
		Timeout:   30 * time.Second,
		Transport: &http.Transport{TLSClientConfig: tlsCfg},
	}
	return &FleetClient{
		baseURL:    baseURL,
		apiKey:     apiKey,
		httpClient: client,
	}
}

// isLoopbackURL parses rawURL and reports whether its hostname is "localhost"
// (compared case-insensitively, because DNS names are case-insensitive and
// url.Parse does not lowercase the host), "127.0.0.1", or "::1".
//
// The whole hostname must match, which avoids prefix-matching pitfalls like
// "localhost.evil.com". A URL that fails to parse, or that has no host
// component, is treated as non-loopback.
func isLoopbackURL(rawURL string) bool {
	u, err := url.Parse(rawURL)
	if err != nil {
		return false
	}
	host := u.Hostname() // strips the port and any IPv6 brackets
	return strings.EqualFold(host, "localhost") || host == "127.0.0.1" || host == "::1"
}

// HostLabel represents a label attached to a host (Fleet returns objects, not
// plain strings). It appears as the element type of Endpoint.Labels.
type HostLabel struct {
	ID   uint   `json:"id"`
	Name string `json:"name"`
}

// Endpoint represents a Fleet endpoint (host) as decoded from the "hosts" /
// "host" envelopes returned by the host-listing and host-detail calls in
// this file.
//
// Go field names do not always mirror the JSON keys: Name is read from
// "hostname" and Version from "osquery_version". TeamID is a pointer, so a
// JSON null or a missing "team_id" decodes to nil.
//
// NOTE(review): LastSeen expects an integer under "last_seen"; confirm that
// Fleet actually emits that key in that form, otherwise the field stays 0.
type Endpoint struct {
	ID             uint        `json:"id"`
	Name           string      `json:"hostname"`
	DisplayName    string      `json:"display_name"`
	ComputerName   string      `json:"computer_name"`
	Status         string      `json:"status"`
	LastSeen       int64       `json:"last_seen"`
	Platform       string      `json:"platform"`
	Version        string      `json:"osquery_version"`
	HardwareSerial string      `json:"hardware_serial"`
	PrimaryIP      string      `json:"primary_ip"`
	TeamID         *uint       `json:"team_id"`
	TeamName       string      `json:"team_name"`
	Labels         []HostLabel `json:"labels"`
}

// Query represents a Fleet query as returned by GetQueries. For team-scoped
// queries GetQueries rewrites Name to "[<team name>] <original name>".
//
// NOTE(review): SQL is read from the "sql" key and Created from an integer
// "created" key, whereas CreateQueryRequest in this file sends the SQL under
// "query". Confirm that Fleet's list response really uses "sql"/"created";
// if it does not, these two fields are always left at their zero values.
type Query struct {
	ID          uint   `json:"id"`
	Name        string `json:"name"`
	Description string `json:"description"`
	SQL         string `json:"sql"`
	Platform    string `json:"platform"`
	Created     int64  `json:"created"`
}

// Policy represents a Fleet policy as returned by GetPolicies, including the
// passing and failing host counts reported by Fleet. For team-scoped policies
// GetPolicies rewrites Name to "[<team name>] <original name>".
type Policy struct {
	ID               uint   `json:"id"`
	Name             string `json:"name"`
	Description      string `json:"description"`
	Platform         string `json:"platform"`
	PassingHostCount int    `json:"passing_host_count"`
	FailingHostCount int    `json:"failing_host_count"`
}

// Label represents a Fleet label as returned by GetLabels. resolveLabelName
// matches on Name (case-insensitively) and returns ID.
//
// NOTE(review): Created expects an integer under "created" — confirm Fleet
// emits that key in that form.
type Label struct {
	ID          uint   `json:"id"`
	Name        string `json:"name"`
	Description string `json:"description"`
	Created     int64  `json:"created"`
}

// HostPolicyEntry represents one policy result attached to a host when the
// hosts endpoint is called with populate_policies=true. It is the element
// type of HostWithPolicies.Policies. The Response field is the per-host
// pass/fail outcome ("pass", "fail", or "" for not-yet-run).
type HostPolicyEntry struct {
	ID          uint   `json:"id"`
	Name        string `json:"name"`
	Query       string `json:"query"`
	Description string `json:"description"`
	Resolution  string `json:"resolution"`
	Platform    string `json:"platform"`
	Critical    bool   `json:"critical"`
	Response    string `json:"response"`
}

// HostWithPolicies is a host listing enriched with the per-host policy
// compliance array. Returned by GetHostByIdentifierWithPolicies and
// GetHostByIDWithPolicies.
//
// Endpoint is embedded, so its fields are encoded and decoded at the top
// level of the JSON object. The shape therefore preserves the existing
// Endpoint fields and adds a top-level "policies" array — backward compatible
// with consumers that only read Endpoint fields, additive for consumers that
// want the policies.
type HostWithPolicies struct {
	Endpoint
	Policies []HostPolicyEntry `json:"policies"`
}

// Team represents a Fleet team as returned by GetTeams. ID drives the
// per-team fan-out in GetQueries / GetPolicies, and Name is what
// resolveTeamNames matches against (case-insensitively).
type Team struct {
	ID          uint   `json:"id"`
	Name        string `json:"name"`
	Description string `json:"description"`
}

// AdHocQueryRequest is the JSON body for single-host ad hoc queries. Query is
// serialized under the "query" key.
type AdHocQueryRequest struct {
	Query string `json:"query"`
}

// AdHocQueryResponse is the response from a single-host ad hoc query. Error
// is a pointer, so a JSON null or a missing "error" key decodes to nil. Rows
// holds one map per result row, keyed by whatever keys the response carries.
type AdHocQueryResponse struct {
	HostID uint                     `json:"host_id"`
	Query  string                   `json:"query"`
	Status string                   `json:"status"`
	Error  *string                  `json:"error"`
	Rows   []map[string]interface{} `json:"rows"`
}

// MultiQueryRunRequest is the JSON body for running a saved query against
// multiple hosts. Because of omitempty, a nil or empty HostIDs slice is left
// out of the encoded body entirely.
type MultiQueryRunRequest struct {
	HostIDs []uint `json:"host_ids,omitempty"`
}

// LiveQueryHostResult is a single host's result from a multi-host query run;
// it is the element type of MultiQueryRunResponse.Results. Error is a
// pointer, so a JSON null or a missing "error" key decodes to nil.
type LiveQueryHostResult struct {
	HostID uint                     `json:"host_id"`
	Rows   []map[string]interface{} `json:"rows"`
	Error  *string                  `json:"error"`
}

// MultiQueryRunResponse is the response from POST
// /api/v1/fleet/queries/:id/run: the query that ran, the targeted and
// responded host counts, and one LiveQueryHostResult per entry in Results.
type MultiQueryRunResponse struct {
	QueryID            uint                  `json:"query_id"`
	TargetedHostCount  int                   `json:"targeted_host_count"`
	RespondedHostCount int                   `json:"responded_host_count"`
	Results            []LiveQueryHostResult `json:"results"`
}

// LiveQueryResult is a unified result returned from RunLiveQuery: the
// targeted and responded host counts plus a flat list of result maps.
//
// NOTE(review): RunLiveQuery is not in this part of the file, so how Results
// is assembled (for example whether each map is tagged with its host) must be
// checked there.
type LiveQueryResult struct {
	TargetedHostCount  int                      `json:"targeted_host_count"`
	RespondedHostCount int                      `json:"responded_host_count"`
	Results            []map[string]interface{} `json:"results"`
}

// CreateQueryRequest represents the payload for creating a saved query.
// Name and Query are always encoded; Description, Platform and TeamID are
// omitted from the JSON body when empty / nil.
//
// TeamID, when non-nil, scopes the query to a specific team (Fleet) — the
// query then appears under that team in the Fleet UI and inherits the team's
// RBAC. Nil leaves the query at the Global scope. The Fleet API treats the
// fields equivalently for execution; the difference matters for ownership,
// listing, and authorization.
type CreateQueryRequest struct {
	Name        string `json:"name"`
	Description string `json:"description,omitempty"`
	Query       string `json:"query"`
	Platform    string `json:"platform,omitempty"`
	TeamID      *uint  `json:"team_id,omitempty"`
}

// normalizePlatform maps user-supplied platform input to Fleet's canonical
// platform string. The input is trimmed and lowercased first, then known
// aliases are folded:
//
//   - "macos", "mac", "osx", "darwin"                         -> "darwin"
//   - "windows"                                               -> "windows"
//   - "linux" and the distro names ubuntu, centos, rhel,
//     debian, fedora, amzn                                    -> "linux"
//   - "chromeos", "chrome"                                    -> "chrome"
//
// Any other value is returned trimmed and lowercased, so unrecognized input
// such as " iOS " normalizes to "ios" instead of keeping its whitespace.
func normalizePlatform(p string) string {
	key := strings.ToLower(strings.TrimSpace(p))
	switch key {
	case "macos", "mac", "osx", "darwin":
		return "darwin"
	case "windows":
		return "windows"
	case "linux", "ubuntu", "centos", "rhel", "debian", "fedora", "amzn":
		return "linux"
	case "chromeos", "chrome":
		return "chrome"
	default:
		// Pass unknown platforms through in the same cleaned form the
		// switch matched on.
		return key
	}
}

// matchesPlatform reports whether a host's platform satisfies targetPlatform.
// The host platform is lowercased before comparing; targetPlatform is used
// as given. A target of "linux" accepts "linux" as well as the distro names
// ubuntu, centos, rhel, debian, fedora and amzn; every other target requires
// an exact match.
func matchesPlatform(hostPlatform, targetPlatform string) bool {
	host := strings.ToLower(hostPlatform)
	if targetPlatform != "linux" {
		return host == targetPlatform
	}
	switch host {
	case "linux", "ubuntu", "centos", "rhel", "debian", "fedora", "amzn":
		return true
	}
	return false
}

// platformToBuiltinLabel translates a user-facing platform name into the name
// of the matching Fleet built-in label. The input is trimmed and lowercased
// before lookup. An empty string is returned for platforms without a mapping.
func platformToBuiltinLabel(platform string) string {
	builtinLabels := map[string]string{
		"macos":    "macOS",
		"darwin":   "macOS",
		"mac":      "macOS",
		"osx":      "macOS",
		"windows":  "MS Windows",
		"linux":    "All Linux",
		"chromeos": "chrome",
		"chrome":   "chrome",
	}
	key := strings.ToLower(strings.TrimSpace(platform))
	// A missing key yields the zero value "", which is the "no mapping" result.
	return builtinLabels[key]
}

// GetEndpoints fetches a single page of hosts from GET /api/v1/fleet/hosts
// with populate_labels=true, so each Endpoint carries its labels.
//
// perPage > 0 is forwarded as the per_page query parameter; pass 0 (or a
// negative value) to let Fleet apply its own default page size. Only that
// one page is returned — this method does not paginate.
//
// An error is returned when the request fails, the status is not 200, or the
// body cannot be decoded.
func (fc *FleetClient) GetEndpoints(ctx context.Context, perPage int) ([]Endpoint, error) {
	q := url.Values{"populate_labels": []string{"true"}}
	if perPage > 0 {
		q.Set("per_page", strconv.Itoa(perPage))
	}

	res, err := fc.makeFleetRequest(ctx, "GET", "/api/v1/fleet/hosts?"+q.Encode(), nil)
	if err != nil {
		return nil, fmt.Errorf("failed to get endpoints: %w", err)
	}
	defer res.Body.Close()

	if code := res.StatusCode; code != http.StatusOK {
		return nil, fmt.Errorf("failed to get endpoints: status code %d", code)
	}

	var payload struct {
		Hosts []Endpoint `json:"hosts"`
	}
	if err := json.NewDecoder(res.Body).Decode(&payload); err != nil {
		return nil, fmt.Errorf("failed to decode endpoints response: %w", err)
	}
	return payload.Hosts, nil
}

// GetHostByIdentifier looks up one host by hostname, UUID, or serial via
// GET /api/v1/fleet/hosts/identifier/:identifier and returns its full details
// (including labels). The identifier is path-escaped before being placed in
// the URL.
//
// GetEndpoints already requests labels via populate_labels=true; this method
// is for targeted lookups of a single host.
//
// A 404 yields a "host not found" error naming the identifier; any other
// non-200 status, a transport failure, or a decode failure is also an error.
func (fc *FleetClient) GetHostByIdentifier(ctx context.Context, identifier string) (*Endpoint, error) {
	path := "/api/v1/fleet/hosts/identifier/" + url.PathEscape(identifier)
	res, err := fc.makeFleetRequest(ctx, "GET", path, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to get host: %w", err)
	}
	defer res.Body.Close()

	switch res.StatusCode {
	case http.StatusOK:
		// Success: continue to decoding below.
	case http.StatusNotFound:
		return nil, fmt.Errorf("host not found: %s", identifier)
	default:
		return nil, fmt.Errorf("failed to get host: status code %d", res.StatusCode)
	}

	var envelope struct {
		Host Endpoint `json:"host"`
	}
	if err := json.NewDecoder(res.Body).Decode(&envelope); err != nil {
		return nil, fmt.Errorf("failed to decode host response: %w", err)
	}
	return &envelope.Host, nil
}

// GetHostByID looks up one host by its numeric Fleet ID via
// GET /api/v1/fleet/hosts/:host_id.
//
// Prefer this over the identifier lookup whenever the caller already holds a
// concrete host_id (e.g. from a prior candidate list): the identifier
// endpoint can silently return the wrong host when several hosts share a
// hostname, whereas :host_id is unambiguous.
//
// A 404 yields a "host not found" error naming the ID; any other non-200
// status, a transport failure, or a decode failure is also an error.
func (fc *FleetClient) GetHostByID(ctx context.Context, hostID uint) (*Endpoint, error) {
	path := "/api/v1/fleet/hosts/" + strconv.FormatUint(uint64(hostID), 10)
	res, err := fc.makeFleetRequest(ctx, "GET", path, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to get host by id: %w", err)
	}
	defer res.Body.Close()

	switch res.StatusCode {
	case http.StatusOK:
		// Success: continue to decoding below.
	case http.StatusNotFound:
		return nil, fmt.Errorf("host not found: id=%d", hostID)
	default:
		return nil, fmt.Errorf("failed to get host by id: status code %d", res.StatusCode)
	}

	var envelope struct {
		Host Endpoint `json:"host"`
	}
	if err := json.NewDecoder(res.Body).Decode(&envelope); err != nil {
		return nil, fmt.Errorf("failed to decode host by id response: %w", err)
	}
	return &envelope.Host, nil
}

// GetHostByIDWithPolicies looks up one host by numeric ID and asks Fleet to
// include the per-host policy compliance array
// (GET /api/v1/fleet/hosts/:host_id?populate_policies=true). It offers the
// same disambiguation guarantee as GetHostByID — :host_id never collides on
// shared hostnames.
//
// A 404 yields a "host not found" error naming the ID; any other non-200
// status, a transport failure, or a decode failure is also an error.
func (fc *FleetClient) GetHostByIDWithPolicies(ctx context.Context, hostID uint) (*HostWithPolicies, error) {
	path := "/api/v1/fleet/hosts/" + strconv.FormatUint(uint64(hostID), 10) + "?populate_policies=true"
	res, err := fc.makeFleetRequest(ctx, "GET", path, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to get host with policies by id: %w", err)
	}
	defer res.Body.Close()

	switch res.StatusCode {
	case http.StatusOK:
		// Success: continue to decoding below.
	case http.StatusNotFound:
		return nil, fmt.Errorf("host not found: id=%d", hostID)
	default:
		return nil, fmt.Errorf("failed to get host with policies by id: status code %d", res.StatusCode)
	}

	var envelope struct {
		Host HostWithPolicies `json:"host"`
	}
	if err := json.NewDecoder(res.Body).Decode(&envelope); err != nil {
		return nil, fmt.Errorf("failed to decode host with policies by id response: %w", err)
	}
	return &envelope.Host, nil
}

// GetHostByIdentifierWithPolicies looks up one host by identifier and returns
// its details together with every policy that applies to it (global +
// fleet-inherited), each entry carrying its pass/fail/empty Response. It
// wraps GET /api/v1/fleet/hosts/identifier/:identifier?populate_policies=true;
// the identifier is path-escaped before being placed in the URL.
//
// This is the single API call behind the per-host "Policies" tab in the
// Fleet UI — it answers "is this host compliant?" in one request instead of
// listing all policies and scanning by host.
//
// A 404 yields a "host not found" error naming the identifier; any other
// non-200 status, a transport failure, or a decode failure is also an error.
func (fc *FleetClient) GetHostByIdentifierWithPolicies(ctx context.Context, identifier string) (*HostWithPolicies, error) {
	path := "/api/v1/fleet/hosts/identifier/" + url.PathEscape(identifier) + "?populate_policies=true"
	res, err := fc.makeFleetRequest(ctx, "GET", path, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to get host with policies: %w", err)
	}
	defer res.Body.Close()

	switch res.StatusCode {
	case http.StatusOK:
		// Success: continue to decoding below.
	case http.StatusNotFound:
		return nil, fmt.Errorf("host not found: %s", identifier)
	default:
		return nil, fmt.Errorf("failed to get host with policies: status code %d", res.StatusCode)
	}

	var envelope struct {
		Host HostWithPolicies `json:"host"`
	}
	if err := json.NewDecoder(res.Body).Decode(&envelope); err != nil {
		return nil, fmt.Errorf("failed to decode host with policies response: %w", err)
	}
	return &envelope.Host, nil
}

// GetQueries retrieves global and all team-specific queries from Fleet.
//
// The global list (GET /api/v1/fleet/reports) is mandatory: a transport
// failure, non-200 status, or decode failure there is returned as an error.
// Team data is best-effort: if the team list cannot be fetched, or an
// individual team's request fails, a warning is logged and that part is
// skipped while the rest of the result is still returned with a nil error.
//
// Team queries are appended after the global ones in the order GetTeams
// returned the teams, and each team query's Name is rewritten to
// "[<team name>] <original name>".
//
// Per-team requests run concurrently, at most teamFanOutConcurrency at a
// time. If ctx is cancelled while waiting for a free slot, no further team
// requests are started; teams not yet requested are skipped like any other
// failed team.
func (fc *FleetClient) GetQueries(ctx context.Context) ([]Query, error) {
	resp, err := fc.makeFleetRequest(ctx, "GET", "/api/v1/fleet/reports", nil)
	if err != nil {
		return nil, fmt.Errorf("failed to get queries: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("failed to get queries: status %d", resp.StatusCode)
	}
	var global struct {
		Queries []Query `json:"queries"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&global); err != nil {
		return nil, fmt.Errorf("failed to decode queries: %w", err)
	}
	all := global.Queries

	teams, err := fc.GetTeams(ctx)
	if err != nil {
		logrus.Warnf("skipping team queries: %v", err)
		return all, nil
	}

	// A nil channel never becomes ready in a select, so a nil ctx simply
	// disables the cancellation case below instead of panicking.
	var done <-chan struct{}
	if ctx != nil {
		done = ctx.Done()
	}

	// Each worker writes only to its own slot of perTeam, so no lock is
	// needed and the slice index preserves the team order for the merge.
	perTeam := make([][]Query, len(teams))
	sem := make(chan struct{}, teamFanOutConcurrency)
	var wg sync.WaitGroup
fanOut:
	for i, team := range teams {
		select {
		case sem <- struct{}{}:
		case <-done:
			logrus.Warnf("team queries fan-out stopped early: %v", ctx.Err())
			break fanOut
		}
		wg.Add(1)
		go func(idx int, team Team) {
			defer wg.Done()
			defer func() { <-sem }()
			teamResp, err := fc.makeFleetRequest(ctx, "GET", fmt.Sprintf("/api/v1/fleet/reports?team_id=%d", team.ID), nil)
			if err != nil {
				logrus.Warnf("team %d queries error: %v", team.ID, err)
				return
			}
			defer teamResp.Body.Close()
			if teamResp.StatusCode != http.StatusOK {
				logrus.Warnf("team %d queries: status %d", team.ID, teamResp.StatusCode)
				return
			}
			var tr struct {
				Queries []Query `json:"queries"`
			}
			if derr := json.NewDecoder(teamResp.Body).Decode(&tr); derr != nil {
				logrus.Warnf("team %d queries decode failed: %v", team.ID, derr)
				return
			}
			for j := range tr.Queries {
				tr.Queries[j].Name = fmt.Sprintf("[%s] %s", team.Name, tr.Queries[j].Name)
			}
			perTeam[idx] = tr.Queries
		}(i, team)
	}
	wg.Wait()

	// Order-stable merge: slots of failed or skipped teams are nil and
	// contribute nothing.
	for _, teamQueries := range perTeam {
		all = append(all, teamQueries...)
	}
	return all, nil
}

// GetPolicies retrieves global and all team-specific policies from Fleet.
//
// The global list (GET /api/v1/fleet/global/policies) is mandatory: a
// transport failure, non-200 status, or decode failure there is returned as
// an error. Team data is best-effort: if the team list cannot be fetched, or
// an individual team's request fails, a warning is logged and that part is
// skipped while the rest of the result is still returned with a nil error.
//
// Team policies are appended after the global ones in the order GetTeams
// returned the teams, and each team policy's Name is rewritten to
// "[<team name>] <original name>".
//
// Per-team requests (GET /api/v1/fleet/teams/:id/policies) run concurrently,
// at most teamFanOutConcurrency at a time. If ctx is cancelled while waiting
// for a free slot, no further team requests are started; teams not yet
// requested are skipped like any other failed team.
func (fc *FleetClient) GetPolicies(ctx context.Context) ([]Policy, error) {
	resp, err := fc.makeFleetRequest(ctx, "GET", "/api/v1/fleet/global/policies", nil)
	if err != nil {
		return nil, fmt.Errorf("failed to get policies: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("failed to get policies: status %d", resp.StatusCode)
	}
	var global struct {
		Policies []Policy `json:"policies"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&global); err != nil {
		return nil, fmt.Errorf("failed to decode policies: %w", err)
	}
	all := global.Policies

	teams, err := fc.GetTeams(ctx)
	if err != nil {
		logrus.Warnf("skipping team policies: %v", err)
		return all, nil
	}

	// A nil channel never becomes ready in a select, so a nil ctx simply
	// disables the cancellation case below instead of panicking.
	var done <-chan struct{}
	if ctx != nil {
		done = ctx.Done()
	}

	// Each worker writes only to its own slot of perTeam, so no lock is
	// needed and the slice index preserves the team order for the merge.
	perTeam := make([][]Policy, len(teams))
	sem := make(chan struct{}, teamFanOutConcurrency)
	var wg sync.WaitGroup
fanOut:
	for i, team := range teams {
		select {
		case sem <- struct{}{}:
		case <-done:
			logrus.Warnf("team policies fan-out stopped early: %v", ctx.Err())
			break fanOut
		}
		wg.Add(1)
		go func(idx int, team Team) {
			defer wg.Done()
			defer func() { <-sem }()
			teamResp, err := fc.makeFleetRequest(ctx, "GET", fmt.Sprintf("/api/v1/fleet/teams/%d/policies", team.ID), nil)
			if err != nil {
				logrus.Warnf("team %d policies error: %v", team.ID, err)
				return
			}
			defer teamResp.Body.Close()
			if teamResp.StatusCode != http.StatusOK {
				logrus.Warnf("team %d policies: status %d", team.ID, teamResp.StatusCode)
				return
			}
			var tr struct {
				Policies []Policy `json:"policies"`
			}
			if derr := json.NewDecoder(teamResp.Body).Decode(&tr); derr != nil {
				logrus.Warnf("team %d policies decode failed: %v", team.ID, derr)
				return
			}
			for j := range tr.Policies {
				tr.Policies[j].Name = fmt.Sprintf("[%s] %s", team.Name, tr.Policies[j].Name)
			}
			perTeam[idx] = tr.Policies
		}(i, team)
	}
	wg.Wait()

	// Order-stable merge: slots of failed or skipped teams are nil and
	// contribute nothing.
	for _, teamPolicies := range perTeam {
		all = append(all, teamPolicies...)
	}
	return all, nil
}

// GetLabels returns the labels listed by GET /api/v1/fleet/labels.
// An error is returned when the request fails, the status is not 200, or the
// body cannot be decoded.
func (fc *FleetClient) GetLabels(ctx context.Context) ([]Label, error) {
	res, err := fc.makeFleetRequest(ctx, "GET", "/api/v1/fleet/labels", nil)
	if err != nil {
		return nil, fmt.Errorf("failed to get labels: %w", err)
	}
	defer res.Body.Close()

	if code := res.StatusCode; code != http.StatusOK {
		return nil, fmt.Errorf("failed to get labels: status code %d", code)
	}

	var payload struct {
		Labels []Label `json:"labels"`
	}
	if err := json.NewDecoder(res.Body).Decode(&payload); err != nil {
		return nil, fmt.Errorf("failed to decode labels response: %w", err)
	}
	return payload.Labels, nil
}

// GetFleetConfig returns the Fleet server configuration from
// GET /api/v1/fleet/config as a generic JSON object (no typed schema is
// applied). An error is returned when the request fails, the status is not
// 200, or the body cannot be decoded.
func (fc *FleetClient) GetFleetConfig(ctx context.Context) (map[string]interface{}, error) {
	res, err := fc.makeFleetRequest(ctx, "GET", "/api/v1/fleet/config", nil)
	if err != nil {
		return nil, fmt.Errorf("failed to get fleet config: %w", err)
	}
	defer res.Body.Close()

	if code := res.StatusCode; code != http.StatusOK {
		return nil, fmt.Errorf("failed to get fleet config: status code %d", code)
	}

	var cfg map[string]interface{}
	if err := json.NewDecoder(res.Body).Decode(&cfg); err != nil {
		return nil, fmt.Errorf("failed to decode fleet config: %w", err)
	}
	return cfg, nil
}

// GetEndpointsWithAggregations returns the platform breakdown for the entire
// Fleet, read from /api/v1/fleet/host_summary, which Fleet computes
// server-side over the full inventory. (An earlier implementation went
// through GetEndpoints(0) and was silently limited to Fleet's default
// 100-host page — wrong on any larger Fleet.)
//
// In the result, Count is the server-reported total host count and Data is a
// map with two keys: "platform_breakdown" (a PlatformBreakdown) and
// "total_count". Platform strings without a dedicated bucket are summed into
// PlatformBreakdown.Other.
//
// An error is returned when the request fails, the status is not 200, or the
// body cannot be decoded.
func (fc *FleetClient) GetEndpointsWithAggregations(ctx context.Context) (*AggregateResponse, error) {
	res, err := fc.makeFleetRequest(ctx, "GET", "/api/v1/fleet/host_summary", nil)
	if err != nil {
		return nil, fmt.Errorf("failed to get host summary: %w", err)
	}
	defer res.Body.Close()
	if code := res.StatusCode; code != http.StatusOK {
		return nil, fmt.Errorf("failed to get host summary: status %d", code)
	}

	var summary struct {
		TotalsHostsCount int `json:"totals_hosts_count"`
		Platforms        []struct {
			Platform   string `json:"platform"`
			HostsCount int    `json:"hosts_count"`
		} `json:"platforms"`
	}
	if err := json.NewDecoder(res.Body).Decode(&summary); err != nil {
		return nil, fmt.Errorf("failed to decode host summary: %w", err)
	}

	var breakdown PlatformBreakdown
	for _, entry := range summary.Platforms {
		// Choose the counter for this platform; anything unrecognized
		// lands in Other.
		bucket := &breakdown.Other
		switch entry.Platform {
		case "darwin":
			bucket = &breakdown.MacOS
		case "windows":
			bucket = &breakdown.Windows
		case "linux", "ubuntu", "centos", "rhel", "debian", "fedora", "amzn":
			bucket = &breakdown.Linux
		case "chrome":
			bucket = &breakdown.ChromeOS
		case "ios":
			bucket = &breakdown.IOS
		case "ipados":
			bucket = &breakdown.IPadOS
		case "android":
			bucket = &breakdown.Android
		}
		*bucket += entry.HostsCount
	}
	total := summary.TotalsHostsCount
	breakdown.Total = total

	data := map[string]interface{}{
		"platform_breakdown": breakdown,
		"total_count":        total,
	}
	return &AggregateResponse{Count: total, Data: data}, nil
}

// GetTeams returns the teams listed by GET /api/v1/fleet/teams.
// An error is returned when the request fails, the status is not 200, or the
// body cannot be decoded.
func (fc *FleetClient) GetTeams(ctx context.Context) ([]Team, error) {
	res, err := fc.makeFleetRequest(ctx, "GET", "/api/v1/fleet/teams", nil)
	if err != nil {
		return nil, fmt.Errorf("failed to get teams: %w", err)
	}
	defer res.Body.Close()

	if code := res.StatusCode; code != http.StatusOK {
		return nil, fmt.Errorf("failed to get teams: status code %d", code)
	}

	var payload struct {
		Teams []Team `json:"teams"`
	}
	if err := json.NewDecoder(res.Body).Decode(&payload); err != nil {
		return nil, fmt.Errorf("failed to decode teams response: %w", err)
	}
	return payload.Teams, nil
}

// GetHostCount returns the total number of hosts as reported by
// GET /api/v1/fleet/hosts/count, without fetching any host records.
// On any failure (request error, non-200 status, undecodable body) it
// returns 0 together with the error.
func (fc *FleetClient) GetHostCount(ctx context.Context) (int, error) {
	res, err := fc.makeFleetRequest(ctx, "GET", "/api/v1/fleet/hosts/count", nil)
	if err != nil {
		return 0, fmt.Errorf("failed to get host count: %w", err)
	}
	defer res.Body.Close()

	if code := res.StatusCode; code != http.StatusOK {
		return 0, fmt.Errorf("failed to get host count: status %d", code)
	}

	var payload struct {
		Count int `json:"count"`
	}
	if err := json.NewDecoder(res.Body).Decode(&payload); err != nil {
		return 0, fmt.Errorf("failed to decode host count response: %w", err)
	}
	return payload.Count, nil
}

// resolveLabelName turns a label name into its numeric ID. The requested
// name is trimmed and lowercased, then compared for exact equality against
// each label's lowercased name; the first match in GetLabels order wins.
//
// When nothing matches, the error lists every available label name so the
// caller can retry with a valid one. There is no caching: GetLabels is
// called on every invocation, mirroring resolveTeamNames. Label lists are
// small on dogfood, so the cost is negligible and the two resolvers stay
// parallel.
func (fc *FleetClient) resolveLabelName(ctx context.Context, name string) (uint, error) {
	labels, err := fc.GetLabels(ctx)
	if err != nil {
		return 0, fmt.Errorf("failed to get labels: %w", err)
	}

	want := strings.ToLower(strings.TrimSpace(name))
	for i := range labels {
		if strings.ToLower(labels[i].Name) == want {
			return labels[i].ID, nil
		}
	}

	// No match: collect all names for the error message.
	names := make([]string, len(labels))
	for i := range labels {
		names[i] = labels[i].Name
	}
	return 0, fmt.Errorf("label not found: %q (available labels: %s)", name, strings.Join(names, ", "))
}

// resolveTeamNames converts team names to team IDs, preserving the order of
// teamNames. Each requested name is trimmed and lowercased and must equal a
// team's lowercased name exactly. If two teams share a name up to case, the
// one listed later by GetTeams is used.
//
// The first name that cannot be resolved aborts the call with an error that
// lists all available team names so the caller can retry; no partial result
// is returned.
func (fc *FleetClient) resolveTeamNames(ctx context.Context, teamNames []string) ([]uint, error) {
	teams, err := fc.GetTeams(ctx)
	if err != nil {
		return nil, fmt.Errorf("failed to get teams: %w", err)
	}

	idByName := make(map[string]uint)
	var known []string
	for i := range teams {
		idByName[strings.ToLower(teams[i].Name)] = teams[i].ID
		known = append(known, teams[i].Name)
	}

	var resolved []uint
	for _, requested := range teamNames {
		key := strings.ToLower(strings.TrimSpace(requested))
		teamID, found := idByName[key]
		if !found {
			return nil, fmt.Errorf("fleet not found: %q (available fleets: %s)", requested, strings.Join(known, ", "))
		}
		resolved = append(resolved, teamID)
	}
	return resolved, nil
}

// fetchHostsHardCap is the safety ceiling on the number of hosts a single
// paginated fetch may accumulate. Fleet inventories of 50k hosts × ~2KB per
// Endpoint = ~100MB in memory per call — without this cap a runaway filter
// (or a Fleet that ignores a filter and returns the full inventory) can OOM
// the MCP. fetchHostsFromPath always uses this value; callers that need a
// different ceiling pass their own hardCap to fetchHostsFromPathBounded,
// which falls back to this value when given hardCap <= 0.
// var (not const) so tests can temporarily lower the cap without having to
// generate 10k+ host fixtures.
var fetchHostsHardCap = 10000

// fetchHostsFromPath issues GETs against an arbitrary Fleet hosts-listing
// path (e.g. /api/v1/fleet/hosts?... or /api/v1/fleet/labels/:id/hosts?...)
// and decodes the {hosts: [...]} envelope. It is a thin wrapper that calls
// fetchHostsFromPathBounded with the package-level fetchHostsHardCap.
//
// Pagination happens server-side via ?page=N until either the upstream
// returns a short page (last page) or the hard cap is reached. The path's
// existing per_page (if any) sets the page size; the worker manages ?page=
// itself.
//
// Returns the collected hosts, a flag that is true when the result was cut
// off at the hard cap, and any error from the worker.
//
// ctx propagation: caller cancellation stops the fan-out between pages —
// long-running multi-page fetches (label intersection, CVE compose) honor
// MCP request cancellation rather than running every page to completion.
func (fc *FleetClient) fetchHostsFromPath(ctx context.Context, path string) ([]Endpoint, bool, error) {
	// Delegate with the default ceiling.
	return fc.fetchHostsFromPathBounded(ctx, path, fetchHostsHardCap)
}

// fetchHostsFromPathBounded is the paginating worker behind
// fetchHostsFromPath.
//
// Parameters:
//   - ctx: checked before every page request; a nil ctx is replaced with
//     context.Background().
//   - path: a hosts-listing path, optionally with a query string. A per_page
//     value between 1 and 1000 in that query is honored; otherwise 500 is
//     used. Any page value is overwritten, starting from page 0.
//   - hardCap: maximum number of hosts to return; hardCap <= 0 falls back to
//     fetchHostsHardCap.
//
// Returns (hosts, truncated, err). Paging stops at the first page shorter
// than per_page. truncated is true only when the upstream returned hosts
// that did not fit under hardCap; in that case a warning is logged so
// callers (e.g. GetVulnerabilityImpact) can surface "result is incomplete"
// to operators instead of silently undercounting. An inventory whose size
// is exactly hardCap is complete and is NOT reported as truncated. On any
// error the host slice is nil and truncated is false.
func (fc *FleetClient) fetchHostsFromPathBounded(ctx context.Context, path string, hardCap int) ([]Endpoint, bool, error) {
	if ctx == nil {
		ctx = context.Background()
	}
	if hardCap <= 0 {
		hardCap = fetchHostsHardCap
	}
	base, query, _ := strings.Cut(path, "?")
	params, err := url.ParseQuery(query)
	if err != nil {
		return nil, false, fmt.Errorf("failed to parse hosts path query: %w", err)
	}
	perPage := 500
	if v := params.Get("per_page"); v != "" {
		if n, perr := strconv.Atoi(v); perr == nil && n > 0 && n <= 1000 {
			perPage = n
		}
	}
	params.Set("per_page", strconv.Itoa(perPage))

	out := make([]Endpoint, 0, perPage)
	truncated := false
	for page := 0; ; page++ {
		// Honor caller cancellation between paginated requests.
		if err := ctx.Err(); err != nil {
			return nil, false, err
		}
		params.Set("page", strconv.Itoa(page))
		resp, err := fc.makeFleetRequest(ctx, "GET", base+"?"+params.Encode(), nil)
		if err != nil {
			return nil, false, fmt.Errorf("failed to fetch hosts: %w", err)
		}
		if resp.StatusCode != http.StatusOK {
			resp.Body.Close()
			return nil, false, fmt.Errorf("failed to fetch hosts: status %d", resp.StatusCode)
		}
		var result struct {
			Hosts []Endpoint `json:"hosts"`
		}
		decErr := json.NewDecoder(resp.Body).Decode(&result)
		// Close explicitly instead of deferring: a defer inside the loop
		// would keep every page's body open until the function returns.
		resp.Body.Close()
		if decErr != nil {
			return nil, false, fmt.Errorf("failed to decode hosts response: %w", decErr)
		}
		hosts := result.Hosts

		// An empty page means the upstream has nothing further. This has
		// to be checked before the cap: when the previous full page landed
		// exactly on hardCap, this request is only a probe, and an empty
		// answer proves the result is complete rather than truncated.
		if len(hosts) == 0 {
			break
		}

		// Hard cap enforcement — keep only what still fits so out never
		// exceeds hardCap. remaining is never negative and may be 0, in
		// which case nothing from this page is kept. Truncation is rare;
		// when it happens the operator should either tighten filters or
		// raise the cap.
		remaining := hardCap - len(out)
		if len(hosts) > remaining {
			out = append(out, hosts[:remaining]...)
			logrus.Warnf("fleet host fetch hit hard cap %d (path=%s) — result truncated; tighten filters or raise fetchHostsHardCap", hardCap, base)
			truncated = true
			break
		}
		out = append(out, hosts...)

		// Last page — Fleet returned fewer than the requested page size.
		if len(hosts) < perPage {
			break
		}
	}
	return out, truncated, nil
}

// resolvePlatformOrLabelToLabelID maps a label name or a platform string to a
// single Fleet label ID so callers can route through
// /api/v1/fleet/labels/:id/hosts. That detour exists because Fleet's /hosts
// endpoint silently IGNORES the ?platform= and ?label_id= filter params.
//
// A non-empty labelName wins and platform is not consulted. When only
// platform is set it is first translated to its built-in label name via
// platformToBuiltinLabel, and that name is resolved instead.
//
// Results:
//   - (id, true, nil) when a label was resolved.
//   - (0, false, nil) when both labelName and platform are empty.
//   - (0, false, err) when the platform is unsupported or resolution failed.
func (fc *FleetClient) resolvePlatformOrLabelToLabelID(ctx context.Context, labelName, platform string) (uint, bool, error) {
	switch {
	case labelName != "":
		labelID, err := fc.resolveLabelName(ctx, labelName)
		if err != nil {
			return 0, false, err
		}
		return labelID, true, nil

	case platform != "":
		builtinName := platformToBuiltinLabel(platform)
		if builtinName == "" {
			return 0, false, fmt.Errorf("unsupported platform %q (use one of: macos, windows, linux, chromeos)", platform)
		}
		labelID, err := fc.resolveLabelName(ctx, builtinName)
		if err != nil {
			return 0, false, fmt.Errorf("failed to resolve built-in label for platform %q: %w", platform, err)
		}
		return labelID, true, nil

	default:
		// Nothing to resolve — the caller should use the plain /hosts path.
		return 0, false, nil
	}
}

// GetEndpointsWithFilters retrieves endpoints from Fleet with optional
// server-side filters.
//
// IMPORTANT — Fleet API quirks this function works around:
//
//   - /api/v1/fleet/hosts SILENTLY IGNORES ?platform= and ?label_id= query
//     params (any value returns the unfiltered host list). To filter by
//     label or platform we must call /api/v1/fleet/labels/:label_id/hosts
//     instead — that endpoint actually scopes results.
//   - /api/v1/fleet/labels/:id/hosts in turn IGNORES ?policy_id= and
//     ?software_version_id= filters but DOES respect ?team_id= / ?status= /
//     ?query=. So when label/platform AND policy_id/policy_response are
//     combined, we fetch both sets and intersect by host ID.
//   - /api/v1/fleet/hosts respects ?team_id, ?status, ?query, ?policy_id,
//     ?policy_response, ?software_version_id — used for the no-label path
//     and as the policy side of the intersection.
//
// Parameters:
//
//   - teamName: fleet (team) name; resolved to a team ID once. Empty to skip.
//   - platform / labelName: trigger label-based routing. labelName takes
//     precedence over platform when both are set; platform is translated to
//     its built-in label name (macOS, MS Windows, All Linux, etc.) and
//     resolved to a label_id for the same routing.
//   - status: passed through as ?status=. Empty to skip.
//   - query: free-text substring matched case-insensitively against
//     hostname / hardware_serial / primary_ip / hardware_model / user
//     inventory (username / email / IdP group). Surrounding whitespace is
//     trimmed; empty to skip.
//   - policyID / policyResponse: policy filter. policyResponse must be
//     "passing" or "failing" and is rejected when policyID is empty.
//   - perPage: when > 0, caps the number of hosts returned.
//
// Returns the matching hosts, or an error when validation, team/label
// resolution, or any Fleet request fails.
func (fc *FleetClient) GetEndpointsWithFilters(ctx context.Context, teamName, platform, status, query, labelName, policyID, policyResponse string, perPage int) ([]Endpoint, error) {
	if policyResponse != "" && policyID == "" {
		return nil, fmt.Errorf("policy_response is only valid when policy_id is also set")
	}
	if policyResponse != "" && policyResponse != "passing" && policyResponse != "failing" {
		return nil, fmt.Errorf("policy_response must be 'passing' or 'failing', got %q", policyResponse)
	}

	// Resolve team name → team ID once (used in every branch below).
	var teamIDStr string
	if teamName != "" {
		teamIDs, err := fc.resolveTeamNames(ctx, []string{teamName})
		if err != nil {
			return nil, fmt.Errorf("failed to resolve fleet: %w", err)
		}
		// Guard the index below: an empty result without an error would
		// otherwise panic. resolveLiveQueryTeamID applies the same check.
		if len(teamIDs) == 0 {
			return nil, fmt.Errorf("fleet %q resolved to no team IDs", teamName)
		}
		teamIDStr = strconv.FormatUint(uint64(teamIDs[0]), 10)
	}

	// commonParams builds the filters that both /hosts and /labels/:id/hosts
	// honor, so the three request paths below cannot drift apart.
	trimmedQuery := strings.TrimSpace(query)
	commonParams := func() url.Values {
		v := url.Values{}
		v.Set("populate_labels", "true")
		if teamIDStr != "" {
			v.Set("team_id", teamIDStr)
		}
		if status != "" {
			v.Set("status", status)
		}
		if trimmedQuery != "" {
			v.Set("query", trimmedQuery)
		}
		return v
	}

	// Decide whether label-based routing is needed.
	labelID, viaLabel, err := fc.resolvePlatformOrLabelToLabelID(ctx, labelName, platform)
	if err != nil {
		return nil, err
	}

	// Path 1: no label/platform — single /hosts call with all filters server-side.
	if !viaLabel {
		params := commonParams()
		// NOTE(review): fetchHostsFromPath treats per_page as a page SIZE and
		// keeps paging until the last page or its hard cap, so a small perPage
		// here means many small requests whose surplus is trimmed below.
		// Consider a fetch limit if this shows up in latency.
		if perPage > 0 {
			params.Set("per_page", strconv.Itoa(perPage))
		}
		if policyID != "" {
			params.Set("policy_id", policyID)
		}
		if policyResponse != "" {
			params.Set("policy_response", policyResponse)
		}
		hosts, _, err := fc.fetchHostsFromPath(ctx, "/api/v1/fleet/hosts?"+params.Encode())
		if err != nil {
			return nil, err
		}
		if perPage > 0 && len(hosts) > perPage {
			hosts = hosts[:perPage]
		}
		return hosts, nil
	}

	// Path 2: label/platform routing — call /labels/:id/hosts with the
	// filters that endpoint respects. Always pull a wide page: intersection
	// (if any) may reduce the count, and the label endpoint doesn't honor
	// most filters, so the caller's perPage is applied only afterwards.
	labelParams := commonParams()
	labelParams.Set("per_page", "500")
	labelHosts, _, err := fc.fetchHostsFromPath(ctx, fmt.Sprintf("/api/v1/fleet/labels/%d/hosts?%s", labelID, labelParams.Encode()))
	if err != nil {
		return nil, err
	}

	// No policy filter → cap, enrich, return.
	//
	// Why enrich: Fleet's /labels/:id/hosts silently ignores populate_labels=true
	// (verified empirically) — every host in labelHosts has Labels=nil. The
	// MCP contract is to return hosts with their Labels populated, so we
	// hydrate via per-host /api/v1/fleet/hosts/:id calls (concurrent, bounded).
	// Capping first keeps the number of per-host calls at most perPage.
	if policyID == "" {
		if perPage > 0 && len(labelHosts) > perPage {
			labelHosts = labelHosts[:perPage]
		}
		fc.enrichHostLabels(ctx, labelHosts)
		return labelHosts, nil
	}

	// Path 3: label + policy combo — fetch policy side via /hosts (which DOES
	// honor populate_labels), then intersect against the label-side ID set.
	//
	// Why iterate policyHosts (not labelHosts): /hosts populates Labels;
	// /labels/:id/hosts does not. Picking from policyHosts means the result
	// already has Labels — no per-host enrichment needed for Path 3.
	policyParams := commonParams()
	policyParams.Set("policy_id", policyID)
	if policyResponse != "" {
		policyParams.Set("policy_response", policyResponse)
	}
	policyParams.Set("per_page", "500")
	policyHosts, _, err := fc.fetchHostsFromPath(ctx, "/api/v1/fleet/hosts?"+policyParams.Encode())
	if err != nil {
		return nil, fmt.Errorf("failed to fetch policy host set for intersection: %w", err)
	}

	labelIDs := make(map[uint]bool, len(labelHosts))
	for _, h := range labelHosts {
		labelIDs[h.ID] = true
	}

	intersected := make([]Endpoint, 0)
	for _, h := range policyHosts {
		if !labelIDs[h.ID] {
			continue
		}
		intersected = append(intersected, h)
		// Stop as soon as the caller's cap is met.
		if perPage > 0 && len(intersected) >= perPage {
			break
		}
	}
	return intersected, nil
}

// enrichHostLabels populates each host's Labels field via per-host detail
// fetches when Labels is nil. Used after /labels/:id/hosts (which silently
// ignores populate_labels=true and leaves Labels unpopulated). Idempotent:
// hosts that already carry Labels are skipped, so callers can invoke
// liberally without re-fetching.
//
// hosts is modified in place. Concurrency is bounded so a 200-host result
// doesn't fan out to 200 in-flight Fleet API calls. When ctx is cancelled no
// further fetches are started, but the function still waits for the ones
// already running before it returns — so no goroutine writes into hosts
// after the caller has the slice back. Per-host failures are logged but do
// not abort the whole call — the caller still gets the original list, just
// with some hosts missing labels.
func (fc *FleetClient) enrichHostLabels(ctx context.Context, hosts []Endpoint) {
	const enrichConcurrency = 8
	sem := make(chan struct{}, enrichConcurrency)
	var wg sync.WaitGroup

dispatch:
	for i := range hosts {
		if hosts[i].Labels != nil {
			continue
		}
		// Check cancellation first so a cancelled ctx deterministically stops
		// dispatch even when a semaphore slot happens to be free.
		if ctx.Err() != nil {
			break dispatch
		}
		// Acquire a slot without blocking past cancellation.
		select {
		case sem <- struct{}{}:
		case <-ctx.Done():
			break dispatch
		}
		wg.Add(1)
		go func(idx int) {
			defer wg.Done()
			defer func() { <-sem }()
			full, err := fc.GetHostByID(ctx, hosts[idx].ID)
			if err != nil {
				logrus.Warnf("enrichHostLabels: failed to fetch host %d: %v", hosts[idx].ID, err)
				return
			}
			// Each goroutine writes only its own index, so no lock is needed.
			hosts[idx].Labels = full.Labels
		}(i)
	}

	// Always drain in-flight workers, including on cancellation. Returning
	// early here would let them mutate hosts while the caller reads it.
	wg.Wait()
}

// GetPolicyCompliance fetches a global policy from
// GET /api/v1/fleet/global/policies/:policy_id and summarizes its pass/fail
// host counts as a PolicyCompliance.
//
// policyID is path-escaped before it is placed in the URL. Total is the sum
// of the passing and failing host counts in the response. A transport
// error, a non-200 status, or an undecodable body is returned as an error.
func (fc *FleetClient) GetPolicyCompliance(ctx context.Context, policyID string) (*PolicyCompliance, error) {
	resp, err := fc.makeFleetRequest(ctx, "GET", "/api/v1/fleet/global/policies/"+url.PathEscape(policyID), nil)
	if err != nil {
		return nil, fmt.Errorf("failed to get policy compliance: %w", err)
	}
	defer resp.Body.Close()

	if code := resp.StatusCode; code != http.StatusOK {
		return nil, fmt.Errorf("failed to get policy compliance: status code %d", code)
	}

	// Only the fields needed for the summary are decoded; everything else in
	// the policy object is ignored.
	var payload struct {
		Policy struct {
			ID               uint   `json:"id"`
			Name             string `json:"name"`
			PassingHostCount int    `json:"passing_host_count"`
			FailingHostCount int    `json:"failing_host_count"`
		} `json:"policy"`
	}
	if decodeErr := json.NewDecoder(resp.Body).Decode(&payload); decodeErr != nil {
		return nil, fmt.Errorf("failed to decode policy compliance response: %w", decodeErr)
	}

	policy := payload.Policy
	summary := &PolicyCompliance{
		PolicyID:     strconv.FormatUint(uint64(policy.ID), 10),
		PolicyName:   policy.Name,
		Total:        policy.PassingHostCount + policy.FailingHostCount,
		Compliant:    policy.PassingHostCount,
		NonCompliant: policy.FailingHostCount,
	}
	return summary, nil
}

// GetTeamPolicyCompliance is the fleet-scoped (team-scoped) counterpart of
// GetPolicyCompliance. It calls
// GET /api/v1/fleet/teams/:team_id/policies/:policy_id and returns the same
// PolicyCompliance shape as the global variant so callers can treat both
// uniformly. Use this — not GetPolicyCompliance — when the caller knows the
// policy belongs to a specific fleet, or when global counts would be
// misleading because the policy is fleet-scoped.
//
// teamID and policyID are each path-escaped. Total is the sum of the passing
// and failing host counts in the response. A transport error, a non-200
// status, or an undecodable body is returned as an error.
func (fc *FleetClient) GetTeamPolicyCompliance(ctx context.Context, teamID, policyID string) (*PolicyCompliance, error) {
	path := "/api/v1/fleet/teams/" + url.PathEscape(teamID) + "/policies/" + url.PathEscape(policyID)

	resp, err := fc.makeFleetRequest(ctx, "GET", path, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to get team policy compliance: %w", err)
	}
	defer resp.Body.Close()

	if code := resp.StatusCode; code != http.StatusOK {
		return nil, fmt.Errorf("failed to get team policy compliance: status code %d", code)
	}

	// Decode just the counters and identity of the policy.
	var envelope struct {
		Policy struct {
			ID               uint   `json:"id"`
			Name             string `json:"name"`
			PassingHostCount int    `json:"passing_host_count"`
			FailingHostCount int    `json:"failing_host_count"`
		} `json:"policy"`
	}
	if decodeErr := json.NewDecoder(resp.Body).Decode(&envelope); decodeErr != nil {
		return nil, fmt.Errorf("failed to decode team policy compliance response: %w", decodeErr)
	}

	passing := envelope.Policy.PassingHostCount
	failing := envelope.Policy.FailingHostCount
	return &PolicyCompliance{
		PolicyID:     strconv.FormatUint(uint64(envelope.Policy.ID), 10),
		PolicyName:   envelope.Policy.Name,
		Total:        passing + failing,
		Compliant:    passing,
		NonCompliant: failing,
	}, nil
}

// GetVulnerabilityImpact retrieves vulnerability impact data.
//
// The aggregate count is derived from the SAME fan-out used by
// GetHostsForCVE (software titles → versions → /hosts?software_version_id=N,
// deduped by host ID) so the count returned here matches the host list
// returned by get_vulnerability_hosts byte-for-byte.
//
// Why not Fleet's /api/v1/fleet/vulnerabilities/:cve.hosts_count? That field
// is populated by Fleet's vuln-aggregation cron, which runs less frequently
// than software inventory. In practice the aggregate trails the software
// inventory by minutes-to-hours, so the two values disagree on freshly
// vulnerable hosts. Sharing the fan-out path keeps impact and host listing
// numerically consistent — the price is N+1 extra HTTP calls per CVE, which
// is fine because impact is a low-frequency operation.
//
// A nil ctx is replaced with context.Background(). An empty or
// whitespace-only cveID is rejected. TotalSystems is best-effort: when the
// host count cannot be fetched it is reported as 0 and the failure is
// logged, rather than failing the whole call.
func (fc *FleetClient) GetVulnerabilityImpact(ctx context.Context, cveID string) (*VulnerabilityImpact, error) {
	if ctx == nil {
		ctx = context.Background()
	}
	if strings.TrimSpace(cveID) == "" {
		return nil, fmt.Errorf("cve_id is required")
	}

	// Reuse GetHostsForCVE with no filters and no per-page cap so the count
	// is the full impacted set, not the truncated tool-response cap. The
	// per-version-id fan-out is still bounded by fetchHostsHardCap; if any
	// fan-out hit it, GetHostsForCVE returns truncated=true and we surface
	// it on the response so operators see "incomplete" rather than a
	// silent undercount.
	hosts, truncated, err := fc.GetHostsForCVE(ctx, cveID, "", "", "", "", "", 0)
	if err != nil {
		return nil, fmt.Errorf("failed to compute vulnerability impact: %w", err)
	}

	// The fleet-wide total only gives the impacted count context, so a
	// failure here degrades to 0. It is logged so a zero total can be told
	// apart from a genuinely empty Fleet.
	totalSystems, countErr := fc.GetHostCount(ctx)
	if countErr != nil {
		logrus.Warnf("GetVulnerabilityImpact: failed to get total host count for %q; reporting total_systems=0: %v", cveID, countErr)
		totalSystems = 0
	}

	return &VulnerabilityImpact{
		CVEID:           cveID,
		TotalSystems:    totalSystems,
		ImpactedSystems: len(hosts),
		Truncated:       truncated,
	}, nil
}

// GetHostsForCVE returns the specific hosts impacted by a CVE, optionally
// narrowed by team / platform / status / query (substring) / label name.
//
// Fleet's /hosts endpoint silently IGNORES ?cve= so we can't filter hosts
// by CVE directly. This composes three steps server-side:
//
//  1. /api/v1/fleet/software/titles?vulnerable=true&query=CVE-X[&team_id=N]
//     → list of software titles affected by the CVE (in the team if scoped).
//  2. /api/v1/fleet/software/titles/:title_id[?team_id=N]
//     → version IDs of the title.
//  3. /api/v1/fleet/hosts?software_version_id=V[&team_id=N&status=...&query=...]
//     → the actual hosts with that version. /hosts respects this
//     filter (unlike ?cve= or ?platform= or ?label_id=).
//
// platform / labelName trigger client-side post-filtering on the final host
// list using each host's populate_labels=true label array — Fleet's /hosts
// endpoint silently ignores these filter params, so we can't push them
// server-side and have to verify membership locally.
//
// Use this — NOT GetVulnerabilityImpact — when callers need the actual
// host list. GetVulnerabilityImpact only returns an aggregate count.
//
// perPage > 0 caps the merged, de-duplicated result and short-circuits the
// remaining fan-out once the cap is met; perPage <= 0 means no cap.
//
// The second return is `truncated` — true when any per-version-id host
// fetch hit fetchHostsHardCap, meaning the impacted-host set is incomplete.
// Callers that need an accurate count (e.g. GetVulnerabilityImpact) must
// surface this so operators see "10000+" rather than a silent undercount.
//
// Failure handling: errors resolving the team or listing titles, and ctx
// cancellation, abort the call. Failures for an individual title detail or
// an individual version's host fetch are logged and skipped.
func (fc *FleetClient) GetHostsForCVE(ctx context.Context, cveID, teamName, platform, status, query, labelName string, perPage int) ([]Endpoint, bool, error) {
	if ctx == nil {
		ctx = context.Background()
	}
	trimmedCVE := strings.TrimSpace(cveID)
	if trimmedCVE == "" {
		return nil, false, fmt.Errorf("cve_id is required")
	}

	// Resolve team once.
	var teamIDStr string
	if teamName != "" {
		teamIDs, err := fc.resolveTeamNames(ctx, []string{teamName})
		if err != nil {
			return nil, false, fmt.Errorf("failed to resolve fleet: %w", err)
		}
		// Guard the index below: an empty result without an error would
		// otherwise panic. resolveLiveQueryTeamID applies the same check.
		if len(teamIDs) == 0 {
			return nil, false, fmt.Errorf("fleet %q resolved to no team IDs", teamName)
		}
		teamIDStr = strconv.FormatUint(uint64(teamIDs[0]), 10)
	}

	// Step 1: software titles affected by this CVE (optionally team-scoped).
	// Paginate so CVEs that hit many vulnerable titles (e.g. an OpenSSL bug
	// affecting dozens of bundled products) don't silently drop pages
	// beyond the first.
	const titlesPerPage = 100
	titleParams := url.Values{}
	titleParams.Set("vulnerable", "true")
	titleParams.Set("query", trimmedCVE)
	titleParams.Set("per_page", strconv.Itoa(titlesPerPage))
	if teamIDStr != "" {
		titleParams.Set("team_id", teamIDStr)
	}
	titleIDs := make([]uint, 0)
	for page := 0; ; page++ {
		// Honor caller cancellation between paginated requests.
		if err := ctx.Err(); err != nil {
			return nil, false, err
		}
		titleParams.Set("page", strconv.Itoa(page))
		titleResp, err := fc.makeFleetRequest(ctx, "GET", "/api/v1/fleet/software/titles?"+titleParams.Encode(), nil)
		if err != nil {
			return nil, false, fmt.Errorf("failed to get software titles for CVE: %w", err)
		}
		// Named `code`, not `status`, so it cannot be confused with (or
		// shadow) the status filter parameter.
		if code := titleResp.StatusCode; code != http.StatusOK {
			titleResp.Body.Close()
			if code == http.StatusNotFound {
				return nil, false, fmt.Errorf("CVE not found: %s", cveID)
			}
			return nil, false, fmt.Errorf("failed to get software titles for CVE: status %d", code)
		}
		var titlesResult struct {
			SoftwareTitles []struct {
				ID uint `json:"id"`
			} `json:"software_titles"`
		}
		decErr := json.NewDecoder(titleResp.Body).Decode(&titlesResult)
		titleResp.Body.Close()
		if decErr != nil {
			return nil, false, fmt.Errorf("failed to decode software titles response: %w", decErr)
		}
		for _, t := range titlesResult.SoftwareTitles {
			titleIDs = append(titleIDs, t.ID)
		}
		// Last page — Fleet returned fewer than the requested page size.
		if len(titlesResult.SoftwareTitles) < titlesPerPage {
			break
		}
	}
	if len(titleIDs) == 0 {
		return []Endpoint{}, false, nil
	}

	// Step 2: per title, fetch detail to get version IDs.
	//
	// NOTE(review): every version listed for a title is collected, not only
	// the versions reported as affected by cveID — confirm whether the title
	// detail response allows narrowing to vulnerable versions.
	versionIDs := make([]uint, 0)
	for _, tID := range titleIDs {
		// Honor caller cancellation between fan-out iterations: a slow CVE with
		// many vulnerable titles can issue dozens of HTTP calls, so checking
		// ctx between each one means a cancelled MCP request stops promptly.
		if err := ctx.Err(); err != nil {
			return nil, false, err
		}
		detailURL := fmt.Sprintf("/api/v1/fleet/software/titles/%d", tID)
		if teamIDStr != "" {
			detailURL += "?team_id=" + teamIDStr
		}
		detailResp, dErr := fc.makeFleetRequest(ctx, "GET", detailURL, nil)
		if dErr != nil {
			logrus.Warnf("failed to fetch software title %d detail: %v", tID, dErr)
			continue
		}
		if detailResp.StatusCode != http.StatusOK {
			detailResp.Body.Close()
			logrus.Warnf("failed to fetch software title %d detail: status %d", tID, detailResp.StatusCode)
			continue
		}
		var detailResult struct {
			SoftwareTitle struct {
				Versions []struct {
					ID uint `json:"id"`
				} `json:"versions"`
			} `json:"software_title"`
		}
		decErr := json.NewDecoder(detailResp.Body).Decode(&detailResult)
		detailResp.Body.Close()
		if decErr != nil {
			logrus.Warnf("failed to decode software title %d detail: %v", tID, decErr)
			continue
		}
		for _, v := range detailResult.SoftwareTitle.Versions {
			versionIDs = append(versionIDs, v.ID)
		}
	}
	if len(versionIDs) == 0 {
		return []Endpoint{}, false, nil
	}

	// Step 3: per version_id, fetch hosts with composing filters server-side.
	// One parameter set is reused across the fan-out; only
	// software_version_id changes per iteration and the set is encoded to a
	// string immediately, so nothing retains it between iterations.
	hostParams := url.Values{}
	hostParams.Set("populate_labels", "true")
	if teamIDStr != "" {
		hostParams.Set("team_id", teamIDStr)
	}
	if status != "" {
		hostParams.Set("status", status)
	}
	if q := strings.TrimSpace(query); q != "" {
		hostParams.Set("query", q)
	}
	// Generous per_page on each fan-out; we cap the merged result at perPage.
	hostParams.Set("per_page", "500")

	// Client-side platform / label post-filter is folded into the per-version
	// loop below. Fleet's /hosts endpoint silently ignores ?platform= and
	// ?label_id=, so we verify membership locally using each host's labels
	// (populate_labels=true).
	needPostFilter := platform != "" || labelName != ""

	seen := make(map[uint]bool)
	hosts := make([]Endpoint, 0)
	truncated := false
	for _, vid := range versionIDs {
		// Short-circuit: if the caller asked for at most perPage hosts and we
		// already have enough qualifying hosts, skip the rest of the fan-out.
		// On a CVE affecting dozens of versions this avoids 10s of MB of
		// downloaded host pages we'd have just truncated away.
		if perPage > 0 && len(hosts) >= perPage {
			break
		}
		// Honor caller cancellation between version-id fan-outs.
		if err := ctx.Err(); err != nil {
			return nil, false, err
		}
		hostParams.Set("software_version_id", strconv.FormatUint(uint64(vid), 10))
		page, pageTruncated, fErr := fc.fetchHostsFromPath(ctx, "/api/v1/fleet/hosts?"+hostParams.Encode())
		if fErr != nil {
			logrus.Warnf("failed to fetch hosts for software_version_id=%d: %v", vid, fErr)
			continue
		}
		if pageTruncated {
			truncated = true
		}
		if needPostFilter {
			page = filterHostsByPlatformOrLabel(page, platform, labelName)
		}
		// A host can carry several versions, so de-duplicate by host ID.
		for _, h := range page {
			if seen[h.ID] {
				continue
			}
			seen[h.ID] = true
			hosts = append(hosts, h)
			if perPage > 0 && len(hosts) >= perPage {
				break
			}
		}
	}

	return hosts, truncated, nil
}

// filterHostsByPlatformOrLabel returns the hosts that satisfy every requested
// criterion. Despite the "Or" in the name, platform and labelName are AND-ed
// when both are given.
//
//   - platform: when non-empty, the host's Platform must be accepted by
//     matchesPlatform against the normalizePlatform form of platform.
//   - labelName: when non-empty after trimming, at least one of the host's
//     labels must equal it, compared in lower case.
//
// Passing "" for both keeps every host. The result is always a fresh,
// non-nil slice in the input order; the input slice is not modified.
func filterHostsByPlatformOrLabel(hosts []Endpoint, platform, labelName string) []Endpoint {
	normalizedPlatform := normalizePlatform(platform)
	labelKey := strings.ToLower(strings.TrimSpace(labelName))
	checkPlatform := platform != ""
	checkLabel := labelKey != ""

	// carriesLabel reports whether the host has the wanted label.
	carriesLabel := func(host Endpoint) bool {
		for _, lbl := range host.Labels {
			if strings.ToLower(lbl.Name) == labelKey {
				return true
			}
		}
		return false
	}

	kept := make([]Endpoint, 0, len(hosts))
	for _, host := range hosts {
		switch {
		case checkPlatform && !matchesPlatform(host.Platform, normalizedPlatform):
			// Wrong platform — drop.
		case checkLabel && !carriesLabel(host):
			// Missing the requested label — drop.
		default:
			kept = append(kept, host)
		}
	}
	return kept
}

// CreateSavedQuery creates a saved query in Fleet by POSTing a
// CreateQueryRequest to /api/v1/fleet/reports and returns the query object
// from the response.
//
// Scope is controlled by teamID: a non-nil value scopes the query to that
// team (Fleet) — it then appears under that team in the Fleet UI, inherits
// its RBAC, and is listed by per-team query enumeration. A nil teamID
// creates the query at the Global scope. The Fleet API treats the fields
// equivalently for SQL execution; the difference matters for ownership,
// listing, and authorization.
//
// Both 200 and 201 are accepted as success. Any other status is returned as
// an error that includes the response body to aid debugging.
func (fc *FleetClient) CreateSavedQuery(ctx context.Context, name, description, sql, platform string, teamID *uint) (*Query, error) {
	payload := CreateQueryRequest{
		Name:        name,
		Description: description,
		Query:       sql,
		Platform:    platform,
		TeamID:      teamID,
	}

	resp, err := fc.makeFleetRequest(ctx, "POST", "/api/v1/fleet/reports", payload)
	if err != nil {
		return nil, fmt.Errorf("failed to create saved query: %w", err)
	}
	defer resp.Body.Close()

	switch resp.StatusCode {
	case http.StatusOK, http.StatusCreated:
		// Success — fall through to decoding.
	default:
		// The body is read best-effort purely to enrich the error message,
		// so a read failure is deliberately ignored.
		raw, _ := io.ReadAll(resp.Body)
		return nil, fmt.Errorf("failed to create saved query: status code %d, body: %s", resp.StatusCode, string(raw))
	}

	var created struct {
		Query Query `json:"query"`
	}
	if decodeErr := json.NewDecoder(resp.Body).Decode(&created); decodeErr != nil {
		return nil, fmt.Errorf("failed to decode created query response: %w", decodeErr)
	}
	return &created.Query, nil
}

// LiveQueryTargetSpec captures every dimension that scopes a live query. It
// is the input to ResolveLiveQueryTargets and RunLiveQueryWithSpec.
//
// Filter dimensions (Fleet / Platform / Label / Status / Query / PolicyID /
// PolicyResponse / CVEID) are AND-ed together — the result is the intersection
// of every non-empty dimension. This mirrors the routing in
// GetEndpointsWithFilters and GetHostsForCVE so live-query target resolution
// is consistent with what the host-listing tools return. Targets are resolved
// with targeted API calls per dimension to avoid fetching all hosts.
//
// Hostnames + HostIDs form an additive "named host" set. When both filter
// dimensions and explicit names are provided, the final target is the
// INTERSECTION (named hosts that also pass the filter) — this lets a caller
// say "run on these specific hosts but only if they're Linux Workstations".
//
// Legacy plural fields (LegacyLabels, LegacyPlatforms, LegacyFleets) are
// accepted for backward compatibility. Only the FIRST item of each list is
// used, and only when the corresponding singular field is empty —
// multi-label / multi-platform / multi-team intersection is not supported by
// Fleet's label/team endpoints in a single round trip and the
// union-then-intersect pattern was the source of the original "everyone gets
// queried" scope-bloat bug. Callers should prefer the singular Fleet /
// Platform / Label fields for new code.
//
// Execution of the resolved targets (see RunLiveQueryWithSpec): a single
// host uses the per-host ad hoc endpoint (POST /api/v1/fleet/hosts/:id/query);
// multiple hosts go through a temp saved query that is created, run by ID,
// and then deleted.
type LiveQueryTargetSpec struct {
	Fleet          string
	Platform       string
	Label          string
	Status         string
	Query          string
	PolicyID       string
	PolicyResponse string
	CVEID          string
	Hostnames      []string
	HostIDs        []uint

	// Legacy / deprecated — first item only.
	LegacyFleets    []string
	LegacyPlatforms []string
	LegacyLabels    []string
}

// ResolveLiveQueryTargets returns the exact host set that would be targeted
// by a live query given the spec. Used by both prepare_live_query (preview)
// and run_live_query (execution) so the two stay in lockstep — what the
// user previews is exactly what gets queried.
//
// Resolution rules:
//
//   - String dimensions are trimmed first; a whitespace-only value counts as
//     unset, so it can neither act as an (empty) filter nor block the legacy
//     fallback. resolveLiveQueryTeamID trims spec.Fleet the same way.
//   - Legacy plural fields fill the matching singular field only when that
//     field is empty, using the first item; extra items are logged and
//     ignored.
//   - With CVEID set, the filter set comes from GetHostsForCVE and is
//     additionally intersected with the policy host set when PolicyID is
//     set. Otherwise it comes from GetEndpointsWithFilters. Each lookup is
//     capped at 500 hosts.
//   - HostIDs and Hostnames build the explicit set. A host ID of 0 and blank
//     hostnames are skipped. A hostname that cannot be resolved, or that
//     matches more than one host, is an error.
//   - With both filters and explicit hosts, the result is the explicit hosts
//     that also appear in the filter set.
//
// Returns an error when no dimension is set at all, when policy_response is
// given without policy_id, or when any lookup fails.
func (fc *FleetClient) ResolveLiveQueryTargets(ctx context.Context, spec LiveQueryTargetSpec) ([]Endpoint, error) {
	// Normalize every string dimension. spec is a by-value copy, so this
	// never mutates the caller's struct. Without it a whitespace-only Query
	// is treated as "a filter is present" and then trimmed to nothing
	// downstream, which resolves to the unfiltered host list.
	for _, field := range []*string{
		&spec.Fleet, &spec.Platform, &spec.Label, &spec.Status,
		&spec.Query, &spec.PolicyID, &spec.PolicyResponse, &spec.CVEID,
	} {
		*field = strings.TrimSpace(*field)
	}

	// Apply legacy fallbacks (first item from each plural).
	if spec.Fleet == "" && len(spec.LegacyFleets) > 0 {
		spec.Fleet = strings.TrimSpace(spec.LegacyFleets[0])
		if len(spec.LegacyFleets) > 1 {
			logrus.Warnf("multi-fleet targeting not supported in single round-trip; using first: %q", spec.Fleet)
		}
	}
	if spec.Platform == "" && len(spec.LegacyPlatforms) > 0 {
		spec.Platform = strings.TrimSpace(spec.LegacyPlatforms[0])
		if len(spec.LegacyPlatforms) > 1 {
			logrus.Warnf("multi-platform targeting not supported in single round-trip; using first: %q", spec.Platform)
		}
	}
	if spec.Label == "" && len(spec.LegacyLabels) > 0 {
		spec.Label = strings.TrimSpace(spec.LegacyLabels[0])
		if len(spec.LegacyLabels) > 1 {
			logrus.Warnf("multi-label targeting not supported in single round-trip; using first: %q", spec.Label)
		}
	}

	hasFilter := spec.Fleet != "" || spec.Platform != "" || spec.Label != "" ||
		spec.Status != "" || spec.Query != "" || spec.PolicyID != "" ||
		spec.PolicyResponse != "" || spec.CVEID != ""
	hasExplicit := len(spec.Hostnames) > 0 || len(spec.HostIDs) > 0
	if !hasFilter && !hasExplicit {
		return nil, fmt.Errorf("at least one target dimension required (fleet, platform, label, status, query, policy_id, cve_id, hostnames, or host_ids)")
	}

	const livePerPage = 500

	// Build filter set.
	var filterSet []Endpoint
	if hasFilter {
		switch {
		case spec.CVEID != "":
			// The non-CVE path rejects policy_response without policy_id inside
			// GetEndpointsWithFilters. Reject it here too: silently dropping a
			// requested filter would widen the set of hosts that get queried.
			if spec.PolicyResponse != "" && spec.PolicyID == "" {
				return nil, fmt.Errorf("policy filter resolution failed: policy_response is only valid when policy_id is also set")
			}
			cveHosts, _, err := fc.GetHostsForCVE(ctx, spec.CVEID, spec.Fleet, spec.Platform, spec.Status, spec.Query, spec.Label, livePerPage)
			if err != nil {
				return nil, fmt.Errorf("CVE filter resolution failed: %w", err)
			}
			filterSet = cveHosts
			// Also intersect with policy filter when both are set. Platform and
			// label were already applied on the CVE side, so they are left
			// empty here to keep the policy lookup on the plain /hosts path.
			if spec.PolicyID != "" {
				policyHosts, pErr := fc.GetEndpointsWithFilters(ctx, spec.Fleet, "", spec.Status, spec.Query, "", spec.PolicyID, spec.PolicyResponse, livePerPage)
				if pErr != nil {
					return nil, fmt.Errorf("policy filter resolution failed: %w", pErr)
				}
				filterSet = intersectHostsByID(filterSet, policyHosts)
			}
		default:
			endpointHosts, err := fc.GetEndpointsWithFilters(ctx, spec.Fleet, spec.Platform, spec.Status, spec.Query, spec.Label, spec.PolicyID, spec.PolicyResponse, livePerPage)
			if err != nil {
				return nil, fmt.Errorf("endpoint filter resolution failed: %w", err)
			}
			filterSet = endpointHosts
		}
	}

	// Build explicit set from host_ids and hostnames, de-duplicated by ID.
	explicitSet := make([]Endpoint, 0)
	seen := make(map[uint]bool)
	for _, id := range spec.HostIDs {
		if id == 0 {
			continue
		}
		h, err := fc.GetHostByID(ctx, id)
		if err != nil {
			return nil, fmt.Errorf("host_id %d not found: %w", id, err)
		}
		if seen[h.ID] {
			continue
		}
		seen[h.ID] = true
		explicitSet = append(explicitSet, *h)
	}
	for _, raw := range spec.Hostnames {
		name := strings.TrimSpace(raw)
		if name == "" {
			continue
		}
		// Query-first to detect hostname collisions before silently picking.
		candidates, qErr := fc.GetEndpointsWithFilters(ctx, "", "", "", name, "", "", "", 50)
		var resolved *Endpoint
		// Fleet's query parameter is a substring match across hostname,
		// computer_name, hardware_serial, primary_ip, etc. Only accept a
		// singleton when it actually matches on a hostname-like field —
		// otherwise serial/IP/user substring hits would be silently treated
		// as hostname matches.
		if qErr == nil && len(candidates) == 1 {
			cand := candidates[0]
			// Prefer the full host record; fall back to the list entry if
			// the detail fetch fails.
			if full, fErr := fc.GetHostByID(ctx, cand.ID); fErr == nil {
				cand = *full
			}
			if endpointMatchesHostname(cand, name) {
				resolved = &cand
			}
		}
		if resolved == nil {
			if qErr == nil && len(candidates) > 1 {
				return nil, fmt.Errorf("hostname %q matches %d hosts — disambiguate with host_ids (Fleet's substring search does not cover display_name; pass numeric IDs to be unambiguous)", name, len(candidates))
			}
			// Fall back to /hosts/identifier/:id for UUID and computer_name matches.
			h, idErr := fc.GetHostByIdentifier(ctx, name)
			if idErr != nil {
				return nil, fmt.Errorf("hostname %q not found: %w", name, idErr)
			}
			resolved = h
		}
		if seen[resolved.ID] {
			continue
		}
		seen[resolved.ID] = true
		explicitSet = append(explicitSet, *resolved)
	}

	// Combine.
	switch {
	case hasFilter && hasExplicit:
		return intersectHostsByID(explicitSet, filterSet), nil
	case hasFilter:
		return filterSet, nil
	default:
		return explicitSet, nil
	}
}

// intersectHostsByID keeps the hosts of a whose ID also occurs in b. The
// output follows the order of a, each ID appears at most once (the first
// occurrence in a wins), and the elements are taken from a, not from b. The
// result is always a non-nil slice, even when empty. ResolveLiveQueryTargets
// uses it for CVE+policy and for named-host+filter intersection.
func intersectHostsByID(a, b []Endpoint) []Endpoint {
	// pending holds the IDs of b that have not been emitted yet. Removing an
	// ID once it is emitted doubles as duplicate suppression for a.
	pending := make(map[uint]struct{}, len(b))
	for i := range b {
		pending[b[i].ID] = struct{}{}
	}

	common := make([]Endpoint, 0)
	for i := range a {
		id := a[i].ID
		if _, wanted := pending[id]; !wanted {
			continue
		}
		delete(pending, id)
		common = append(common, a[i])
	}
	return common
}

// RunLiveQuery is the legacy entry point for live queries, preserved so
// existing callers keep working. New code should use RunLiveQueryWithSpec,
// which exposes the full set of filter dimensions.
//
// hostnames becomes the spec's explicit host list; labels, platforms and
// teams populate the legacy plural fields (LegacyLabels, LegacyPlatforms,
// LegacyFleets). Target resolution and execution are delegated to
// RunLiveQueryWithSpec.
func (fc *FleetClient) RunLiveQuery(ctx context.Context, sql string, hostnames, labels, platforms, teams []string) (*LiveQueryResult, error) {
	return fc.RunLiveQueryWithSpec(ctx, sql, LiveQueryTargetSpec{
		Hostnames:       hostnames,
		LegacyFleets:    teams,
		LegacyPlatforms: platforms,
		LegacyLabels:    labels,
	})
}

// RunLiveQueryWithSpec resolves the spec to an exact target host list using
// the same intersection semantics as ResolveLiveQueryTargets, then dispatches
// to single-host or multi-host osquery distribution.
//
// When spec.Fleet (or the legacy spec.LegacyFleets[0]) is set and more than
// one host is targeted, the team is resolved here and the team_id is threaded
// through runMultiHostQuery so the transient saved query is created under
// that team instead of Global. The host targeting itself is already
// team-scoped via ResolveLiveQueryTargets; this additionally aligns the
// saved-query ownership / RBAC with the team.
//
// Returns an error when target resolution fails, when no host matches, or
// (multi-host path only) when the team cannot be resolved.
func (fc *FleetClient) RunLiveQueryWithSpec(ctx context.Context, sql string, spec LiveQueryTargetSpec) (*LiveQueryResult, error) {
	targets, err := fc.ResolveLiveQueryTargets(ctx, spec)
	if err != nil {
		return nil, fmt.Errorf("failed to resolve target hosts: %w", err)
	}
	if len(targets) == 0 {
		return nil, fmt.Errorf("no matching hosts found for the provided targets")
	}

	hostIDs := make([]uint, 0, len(targets))
	nameByID := make(map[uint]Endpoint, len(targets))
	for _, t := range targets {
		hostIDs = append(hostIDs, t.ID)
		nameByID[t.ID] = t
	}

	if len(hostIDs) == 1 {
		// Ad-hoc single-host path uses POST /hosts/:id/query directly — no
		// saved query is created so team scoping does not apply. The team is
		// therefore not resolved on this path, which saves a lookup and
		// removes a failure mode that could not affect the result.
		return fc.runAdHocSingleHost(ctx, hostIDs[0], sql, nameByID)
	}

	// Multi-host path: the temp saved query needs an owner scope.
	teamID, err := fc.resolveLiveQueryTeamID(ctx, spec)
	if err != nil {
		return nil, err
	}
	return fc.runMultiHostQuery(ctx, hostIDs, sql, nameByID, teamID)
}

// resolveLiveQueryTeamID determines which team ID, if any, a live query's
// transient saved query should be created under.
//
// The team name is taken from spec.Fleet; when that is blank after trimming,
// the first entry of spec.LegacyFleets is used instead. If no non-blank name
// is found the function returns (nil, nil), which callers treat as Global
// scope. Otherwise the name is resolved through resolveTeamNames and a
// pointer to the first returned ID is handed back. A lookup failure or an
// empty lookup result is reported as an error.
func (fc *FleetClient) resolveLiveQueryTeamID(ctx context.Context, spec LiveQueryTargetSpec) (*uint, error) {
	// Candidates in precedence order; the first non-blank one wins.
	candidates := []string{spec.Fleet}
	if len(spec.LegacyFleets) > 0 {
		candidates = append(candidates, spec.LegacyFleets[0])
	}
	fleetName := ""
	for _, raw := range candidates {
		if fleetName = strings.TrimSpace(raw); fleetName != "" {
			break
		}
	}
	if fleetName == "" {
		return nil, nil
	}

	resolved, err := fc.resolveTeamNames(ctx, []string{fleetName})
	switch {
	case err != nil:
		return nil, fmt.Errorf("failed to resolve fleet %q for query scoping: %w", fleetName, err)
	case len(resolved) == 0:
		return nil, fmt.Errorf("fleet %q resolved to no team IDs", fleetName)
	}
	teamID := resolved[0]
	return &teamID, nil
}

// runAdHocSingleHost runs sql against one host through
// POST /api/v1/fleet/hosts/:id/query (Fleet 4.43+ synchronous REST) and wraps
// the answer in a LiveQueryResult holding a single result entry.
//
// endpointByID supplies the host's name for the "host_name" field: the
// DisplayName when set, otherwise Name, otherwise the empty string (also when
// hostID is not in the map). The entry always carries "host_id", "host_name",
// "status" and "rows"; "error" is added only when Fleet reported one.
// RespondedHostCount is 1 only when Fleet reports the host status "online".
//
// A transport failure, a non-200 status, or an undecodable body is returned
// as an error.
func (fc *FleetClient) runAdHocSingleHost(ctx context.Context, hostID uint, sql string, endpointByID map[uint]Endpoint) (*LiveQueryResult, error) {
	resp, err := fc.makeFleetRequest(ctx, "POST", fmt.Sprintf("/api/v1/fleet/hosts/%d/query", hostID), AdHocQueryRequest{Query: sql})
	if err != nil {
		return nil, fmt.Errorf("ad hoc query failed: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Best effort: the body is only used to enrich the error message.
		raw, _ := io.ReadAll(resp.Body)
		return nil, fmt.Errorf("ad hoc query failed with status %d: %s", resp.StatusCode, string(raw))
	}

	var payload AdHocQueryResponse
	if decodeErr := json.NewDecoder(resp.Body).Decode(&payload); decodeErr != nil {
		return nil, fmt.Errorf("failed to decode ad hoc query response: %w", decodeErr)
	}

	// A missing map entry yields the zero Endpoint, whose names are empty, so
	// an unknown host naturally ends up with an empty label.
	host := endpointByID[hostID]
	label := host.DisplayName
	if label == "" {
		label = host.Name
	}

	entry := map[string]interface{}{
		"host_id":   hostID,
		"host_name": label,
		"status":    payload.Status,
		"rows":      payload.Rows,
	}
	if payload.Error != nil {
		entry["error"] = *payload.Error
	}

	result := &LiveQueryResult{
		TargetedHostCount: 1,
		Results:           []map[string]interface{}{entry},
	}
	if payload.Status == "online" {
		result.RespondedHostCount = 1
	}
	return result, nil
}

// runMultiHostQuery runs sql against several hosts by creating a temporary
// saved query, running it by ID, and deleting it again.
//
// Flow:
//   - The temp query is created with CreateSavedQuery under teamID (nil means
//     no team was requested). Its name is tempQueryNamePrefix followed by a
//     millisecond timestamp and 8 random bytes in hex: the timestamp keeps
//     names sortable for log scans, the random part keeps concurrent
//     invocations from colliding.
//   - The query is executed with POST /api/v1/fleet/reports/:id/run for
//     hostIDs.
//   - On every return path after creation, a deferred
//     DELETE /api/v1/fleet/reports/id/:id removes the temp query. A transport
//     error or a status other than 200/204 is logged at error level;
//     SweepLeftoverTempQueries removes such residue later.
//
// Each result entry carries "host_id" and "rows", plus "host_name" when the
// host is present in endpointByID (DisplayName, falling back to Name) and
// "error" when Fleet reported one. The targeted/responded counts are copied
// from Fleet's response.
func (fc *FleetClient) runMultiHostQuery(ctx context.Context, hostIDs []uint, sql string, endpointByID map[uint]Endpoint, teamID *uint) (*LiveQueryResult, error) {
	stampMillis := time.Now().UnixMilli()
	tempName := fmt.Sprintf("%s%d-%s", tempQueryNamePrefix, stampMillis, randomHexSuffix(8))

	// teamID comes from the caller's spec; when set, the temp query is
	// created under that team instead of Global.
	temp, err := fc.CreateSavedQuery(ctx, tempName, "Temporary MCP live query", sql, "", teamID)
	if err != nil {
		return nil, fmt.Errorf("failed to create temporary query: %w", err)
	}
	queryID := temp.ID

	defer func() {
		// Cleanup deliberately does not use the request ctx: if the caller
		// has already cancelled, the DELETE should still be attempted. The
		// detached context is capped at 10s so an unresponsive Fleet cannot
		// hold this goroutine indefinitely.
		cleanupCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
		defer cancel()

		delResp, delErr := fc.makeFleetRequest(cleanupCtx, "DELETE", fmt.Sprintf("/api/v1/fleet/reports/id/%d", queryID), nil)
		if delResp != nil {
			delResp.Body.Close()
		}
		switch {
		case delErr != nil:
			logrus.Errorf("failed to delete temp query %s (id=%d): %v — will be swept on next startup", tempName, queryID, delErr)
		case delResp != nil && delResp.StatusCode != http.StatusOK && delResp.StatusCode != http.StatusNoContent:
			logrus.Errorf("temp query DELETE returned status %d for %s (id=%d) — will be swept on next startup", delResp.StatusCode, tempName, queryID)
		}
	}()

	logrus.Infof("Created temp query ID=%d, running against %d hosts", queryID, len(hostIDs))

	resp, err := fc.makeFleetRequest(ctx, "POST", fmt.Sprintf("/api/v1/fleet/reports/%d/run", queryID), MultiQueryRunRequest{HostIDs: hostIDs})
	if err != nil {
		return nil, fmt.Errorf("failed to run live query: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Best effort: the body is only used to enrich the error message.
		raw, _ := io.ReadAll(resp.Body)
		return nil, fmt.Errorf("live query run failed with status %d: %s", resp.StatusCode, string(raw))
	}

	var run MultiQueryRunResponse
	if decodeErr := json.NewDecoder(resp.Body).Decode(&run); decodeErr != nil {
		return nil, fmt.Errorf("failed to decode live query run response: %w", decodeErr)
	}

	// Left nil when Fleet returns no per-host results.
	var enriched []map[string]interface{}
	for _, hostResult := range run.Results {
		entry := map[string]interface{}{
			"host_id": hostResult.HostID,
			"rows":    hostResult.Rows,
		}
		// host_name is only attached for hosts we resolved ourselves.
		if host, known := endpointByID[hostResult.HostID]; known {
			label := host.DisplayName
			if label == "" {
				label = host.Name
			}
			entry["host_name"] = label
		}
		if hostResult.Error != nil {
			entry["error"] = *hostResult.Error
		}
		enriched = append(enriched, entry)
	}

	return &LiveQueryResult{
		TargetedHostCount:  run.TargetedHostCount,
		RespondedHostCount: run.RespondedHostCount,
		Results:            enriched,
	}, nil
}

// isTempQueryName reports whether name identifies a transient saved query
// created by runMultiHostQuery, i.e. whether it starts with
// tempQueryNamePrefix.
//
// GetQueries prepends "[<team>] " to team-scoped query names, so when name
// starts with "[" everything up to and including the first "] " is skipped
// before the prefix check. A leading "[" with no "] " after it leaves the
// name untouched.
func isTempQueryName(name string) bool {
	candidate := name
	if strings.HasPrefix(name, "[") {
		if _, afterTag, tagged := strings.Cut(name, "] "); tagged {
			candidate = afterTag
		}
	}
	return strings.HasPrefix(candidate, tempQueryNamePrefix)
}

// endpointMatchesHostname reports whether any of ep's hostname-like fields
// (Name, ComputerName, DisplayName) equals name under case-insensitive
// comparison (strings.EqualFold). It lets a caller confirm that a single
// substring hit from Fleet's /hosts?query= matched on a hostname rather than
// on a serial, IP or user field.
func endpointMatchesHostname(ep Endpoint, name string) bool {
	for _, field := range [...]string{ep.Name, ep.ComputerName, ep.DisplayName} {
		if strings.EqualFold(field, name) {
			return true
		}
	}
	return false
}

// SweepLeftoverTempQueries deletes every saved query that isTempQueryName
// recognises as a runMultiHostQuery temp query (name starting with
// tempQueryNamePrefix, optionally behind a "[<team>] " tag). It cleans up
// residue from earlier runMultiHostQuery invocations whose deferred DELETE
// did not go through (process killed mid-run, Fleet 5xx, network partition).
//
// The sweep is best-effort and returns nothing: a failure to list queries
// aborts the sweep with a warning, and a failure to delete an individual
// query is logged and skipped. A query counts as swept only when Fleet
// answers the DELETE with 200 or 204, the same success criterion used by the
// deferred cleanup in runMultiHostQuery. makeFleetRequest reports transport
// failures only, so an HTTP error status has to be checked here explicitly;
// otherwise a rejected DELETE would be counted as a success.
func (fc *FleetClient) SweepLeftoverTempQueries(ctx context.Context) {
	queries, err := fc.GetQueries(ctx)
	if err != nil {
		logrus.Warnf("temp-query sweep: failed to list queries: %v", err)
		return
	}
	swept := 0
	for _, q := range queries {
		if !isTempQueryName(q.Name) {
			continue
		}
		delEndpoint := fmt.Sprintf("/api/v1/fleet/reports/id/%d", q.ID)
		r, err := fc.makeFleetRequest(ctx, "DELETE", delEndpoint, nil)
		if r != nil {
			// Only the status code is needed; it stays readable after Close.
			r.Body.Close()
		}
		if err != nil {
			logrus.Warnf("temp-query sweep: failed to delete %s (id=%d): %v", q.Name, q.ID, err)
			continue
		}
		if r == nil {
			// Defensive: a nil response with a nil error gives no proof of
			// deletion, so do not count it.
			logrus.Warnf("temp-query sweep: no response deleting %s (id=%d)", q.Name, q.ID)
			continue
		}
		if r.StatusCode != http.StatusOK && r.StatusCode != http.StatusNoContent {
			logrus.Warnf("temp-query sweep: DELETE returned status %d for %s (id=%d)", r.StatusCode, q.Name, q.ID)
			continue
		}
		swept++
	}
	if swept > 0 {
		logrus.Infof("temp-query sweep: deleted %d leftover %s* queries", swept, tempQueryNamePrefix)
	}
}

// makeFleetRequest builds and executes one Fleet API request bound to ctx.
//
// endpoint is appended verbatim to fc.baseURL, so it must contain the path
// and any query string. A non-nil body is JSON-encoded and sent as the
// request body. Every request carries a JSON Content-Type header and a
// Bearer Authorization header built from fc.apiKey.
//
// Context: the request is created with ctx, so cancelling it (client
// disconnect, deadline exceeded) cancels the in-flight HTTP call. A nil ctx
// is replaced with context.Background(); callers without a useful ctx may
// pass context.Background() themselves, while long-running fan-outs should
// pass the MCP handler's ctx.
//
// Logging: only the method and the part of endpoint before the first "?" are
// written to the debug log. Query strings can carry user-supplied
// identifiers (hostnames, serials, emails, CVE IDs), so they are kept out of
// the logs while the route shape stays observable.
//
// Returns: the raw *http.Response on success. The caller must close its Body
// and check its StatusCode, because an HTTP error status is not turned into
// an error here. An error is returned when the body cannot be marshalled,
// the request cannot be constructed, or the HTTP round trip fails.
func (fc *FleetClient) makeFleetRequest(ctx context.Context, method, endpoint string, body interface{}) (*http.Response, error) {
	if ctx == nil {
		ctx = context.Background()
	}

	// Must remain an untyped nil io.Reader when there is no body.
	var payload io.Reader
	if body != nil {
		encoded, err := json.Marshal(body)
		if err != nil {
			return nil, fmt.Errorf("failed to marshal request body: %w", err)
		}
		payload = bytes.NewReader(encoded)
	}

	target := fc.baseURL + endpoint
	req, err := http.NewRequestWithContext(ctx, method, target, payload)
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", "Bearer "+fc.apiKey)

	// PII-safe debug line: drop everything from the first '?' onwards.
	route := endpoint
	if q := strings.IndexByte(endpoint, '?'); q >= 0 {
		route = endpoint[:q]
	}
	logrus.Debugf("%s %s", method, route)

	resp, err := fc.httpClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to make request to Fleet API: %w", err)
	}
	return resp, nil
}