Add dev infrastructure and docs for Prometheus monitoring (#33)

- Set up a simple example of Prometheus monitoring in the development
  docker-compose.yml.
- Add documentation for configuring Prometheus.
This commit is contained in:
Zach Wasserman 2020-11-12 19:06:56 -08:00 committed by GitHub
parent 138329c371
commit 6cbd10965c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 61 additions and 14 deletions

View file

@ -45,5 +45,12 @@ services:
- /sys:/sys:ro
- /var/lib/docker/:/var/lib/docker:ro
prometheus:
image: prom/prometheus:latest
ports:
- "9090:9090"
volumes:
- ./tools/app/prometheus.yml:/etc/prometheus/prometheus.yml
volumes:
mysql-persistent-volume:

View file

@ -11,7 +11,7 @@ Kolide Fleet is an infrastructure instrumentation application which has it's own
## Installing Fleet and its dependencies
The Fleet server has a few dependencies. To learn more about installing the Fleet server and it's dependencies, see the [Installing Fleet](./installing-fleet.md) guide.
The Fleet server has a few infrastructure dependencies. To learn more about installing the Fleet server and its dependencies, see the [Installing Fleet](./installing-fleet.md) guide.
## Managing a Fleet server
@ -19,6 +19,8 @@ We're prepared a brief guide to help you manage and maintain your Fleet server.
For more information, you can also read the [Configuring The Fleet Binary](./configuring-the-fleet-binary.md) guide for information on how to configure and customize Fleet for your organization.
Once the Fleet server is installed and configured, take a look at the [Monitoring & Alerting](./monitoring-alerting.md) documentation.
## Working with osquery logs
Fleet allows users to schedule queries, curate packs, and generate a lot of osquery logs. For more information on how you can access these logs as well as examples on what you can do with them, see the [Working With Osquery Logs](./working-with-osquery-logs.md) documentation.

View file

@ -78,9 +78,7 @@ fleet prepare db \
## How do I monitor a Fleet server?
Fleet provides a `/healthz` endpoint. If you query it with `curl` it will return an HTTP Status code. `200 OK` means everything is alright. `500 Internal Server Error` means Fleet is having trouble communicating with MySQL or Redis. Check the Fleet logs for additional details.
The `/metrics` endpoint exposes data ready to be ingested by Prometheus.
Fleet provides standard interfaces for monitoring and alerting. See the [Monitoring & Alerting](./monitoring-alerting.md) documentation for details.
## Why is the "Add User" button disabled?

View file

@ -0,0 +1,32 @@
# Monitoring Fleet
## Health Checks
Fleet exposes a basic health check at the `/healthz` endpoint. This is the interface to use for simple monitoring and load-balancer health checks.
The `/healthz` endpoint will return an `HTTP 200` status if the server is running and has healthy connections to MySQL and Redis. If there are any problems, the endpoint will return an `HTTP 500` status.
## Metrics
Fleet exposes server metrics in a format compatible with [Prometheus](https://prometheus.io/). A simple example Prometheus configuration is available in [tools/app/prometheus.yml](/tools/app/prometheus.yml).
Prometheus can be configured to use a wide range of service discovery mechanisms within AWS, GCP, Azure, Kubernetes, and more. See the Prometheus [configuration documentation](https://prometheus.io/docs/prometheus/latest/configuration/configuration/) for more information on configuring these service discovery mechanisms.
### Alerting
Prometheus has built-in support for alerting through [Alertmanager](https://prometheus.io/docs/alerting/latest/overview/).
Consider building alerts for:
- Changes from expected levels of host enrollment
- Increased latency on HTTP endpoints
- Increased error levels on HTTP endpoints
```
TODO (Seeking Contributors)
Add example alerting configurations
```
### Graphing
Prometheus provides basic graphing capabilities, and integrates tightly with [Grafana](https://prometheus.io/docs/visualization/grafana/) for sophisticated visualizations.

View file

@ -5,12 +5,12 @@ import (
"net/http"
"strings"
"github.com/fleetdm/fleet/server/config"
"github.com/fleetdm/fleet/server/kolide"
"github.com/go-kit/kit/endpoint"
kitlog "github.com/go-kit/kit/log"
kithttp "github.com/go-kit/kit/transport/http"
"github.com/gorilla/mux"
"github.com/fleetdm/fleet/server/config"
"github.com/fleetdm/fleet/server/kolide"
"github.com/prometheus/client_golang/prometheus"
)
@ -102,7 +102,7 @@ type KolideEndpoints struct {
StatusResultStore endpoint.Endpoint
StatusLiveQuery endpoint.Endpoint
ListCarves endpoint.Endpoint
GetCarve endpoint.Endpoint
GetCarve endpoint.Endpoint
GetCarveBlock endpoint.Endpoint
}
@ -196,7 +196,7 @@ func MakeKolideServerEndpoints(svc kolide.Service, jwtKey, urlPrefix string) Kol
GetCertificate: authenticatedUser(jwtKey, svc, makeCertificateEndpoint(svc)),
ChangeEmail: authenticatedUser(jwtKey, svc, makeChangeEmailEndpoint(svc)),
ListCarves: authenticatedUser(jwtKey, svc, makeListCarvesEndpoint(svc)),
GetCarve: authenticatedUser(jwtKey, svc, makeGetCarveEndpoint(svc)),
GetCarve: authenticatedUser(jwtKey, svc, makeGetCarveEndpoint(svc)),
GetCarveBlock: authenticatedUser(jwtKey, svc, makeGetCarveBlockEndpoint(svc)),
// Authenticated status endpoints
@ -304,7 +304,7 @@ type kolideHandlers struct {
StatusResultStore http.Handler
StatusLiveQuery http.Handler
ListCarves http.Handler
GetCarve http.Handler
GetCarve http.Handler
GetCarveBlock http.Handler
}
@ -399,8 +399,8 @@ func makeKolideKitHandlers(e KolideEndpoints, opts []kithttp.ServerOption) *koli
StatusResultStore: newServer(e.StatusResultStore, decodeNoParamsRequest),
StatusLiveQuery: newServer(e.StatusLiveQuery, decodeNoParamsRequest),
ListCarves: newServer(e.ListCarves, decodeListCarvesRequest),
GetCarve: newServer(e.GetCarve, decodeGetCarveRequest),
GetCarveBlock: newServer(e.GetCarveBlock, decodeGetCarveBlockRequest),
GetCarve: newServer(e.GetCarve, decodeGetCarveRequest),
GetCarveBlock: newServer(e.GetCarveBlock, decodeGetCarveBlockRequest),
}
}
@ -423,12 +423,13 @@ func MakeHandler(svc kolide.Service, config config.KolideConfig, logger kitlog.L
r := mux.NewRouter()
attachKolideAPIRoutes(r, kolideHandlers)
addMetrics(r)
// Results endpoint is handled different due to websockets use
r.PathPrefix("/api/v1/kolide/results/").
Handler(makeStreamDistributedQueryCampaignResultsHandler(svc, config.Auth.JwtKey, logger)).
Name("distributed_query_results")
addMetrics(r)
return r
}
@ -439,7 +440,6 @@ func addMetrics(r *mux.Router) {
return nil
}
r.Walk(walkFn)
}
func attachKolideAPIRoutes(r *mux.Router, h *kolideHandlers) {

8
tools/app/prometheus.yml Normal file
View file

@ -0,0 +1,8 @@
# Example Prometheus configuration that scrapes a Fleet server in the
# development environment.
scrape_configs:
- job_name: fleet
# Scrape the target over HTTPS.
scheme: https
# Poll the target every 5 seconds.
scrape_interval: 5s
static_configs:
# NOTE(review): host.docker.internal:8080 is presumably the Fleet server running on the Docker host — confirm.
- targets: ['host.docker.internal:8080']
tls_config:
# Skips verification of the target's TLS certificate.
# NOTE(review): presumably needed for a self-signed development certificate — do not reuse in production.
insecure_skip_verify: true