mirror of
https://github.com/mudler/LocalAI
synced 2026-04-21 13:27:21 +00:00
When installing a backend with a custom OCI URI in distributed mode, the URI was captured in ManagementOp.ExternalURI by the HTTP handler but never forwarded to workers. BackendInstallRequest had no URI field, so workers fell through to the gallery lookup and failed with "no backend found with name <custom-name>". Add URI/Name/Alias fields to BackendInstallRequest and thread them from ManagementOp through DistributedBackendManager.InstallBackend() and the RemoteUnloaderAdapter. On the worker side, route to InstallExternalBackend when URI is set instead of InstallBackendFromGallery. Update all remaining InstallBackend call sites (UpgradeBackend, reconciler pending-op drain, router auto-install) to pass empty strings for the new params. Assisted-by: Claude Code:claude-sonnet-4-6 Signed-off-by: Russell Sim <rsl@simopolis.xyz>
429 lines
15 KiB
Go
429 lines
15 KiB
Go
package nodes
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"time"
|
|
|
|
"github.com/mudler/LocalAI/core/services/advisorylock"
|
|
grpcclient "github.com/mudler/LocalAI/pkg/grpc"
|
|
"github.com/mudler/xlog"
|
|
"github.com/nats-io/nats.go"
|
|
"gorm.io/gorm"
|
|
)
|
|
|
|
// ModelProber checks whether a model's backend process is still reachable.
|
|
// Defaulted to a gRPC health probe but overridable for tests so we don't
|
|
// need to stand up a real server. Returning false without an error means the
|
|
// process is reachable but unhealthy (same as a timeout for our purposes).
|
|
type ModelProber interface {
|
|
IsAlive(ctx context.Context, address string) bool
|
|
}
|
|
|
|
// grpcModelProber does a 1s HealthCheck on the model's stored gRPC address.
|
|
type grpcModelProber struct{ token string }
|
|
|
|
func (g grpcModelProber) IsAlive(ctx context.Context, address string) bool {
|
|
client := grpcclient.NewClientWithToken(address, false, nil, false, g.token)
|
|
probeCtx, cancel := context.WithTimeout(ctx, 1*time.Second)
|
|
defer cancel()
|
|
ok, _ := client.HealthCheck(probeCtx)
|
|
return ok
|
|
}
|
|
|
|
// ReplicaReconciler periodically ensures model replica counts match their
|
|
// scheduling configs. It scales up replicas when below MinReplicas or when
|
|
// all replicas are busy (up to MaxReplicas), and scales down idle replicas
|
|
// above MinReplicas.
|
|
//
|
|
// Alongside replica scaling it runs two state-reconciliation passes — draining
|
|
// the pending_backend_ops queue and probing loaded models' gRPC addresses to
|
|
// orphan ghosts. Both passes are wrapped in the KeyStateReconciler advisory
|
|
// lock so N frontends don't stampede.
|
|
//
|
|
// Only processes models with auto-scaling enabled (MinReplicas > 0 or MaxReplicas > 0).
|
|
type ReplicaReconciler struct {
|
|
registry *NodeRegistry
|
|
scheduler ModelScheduler // interface for scheduling new models
|
|
unloader NodeCommandSender
|
|
adapter *RemoteUnloaderAdapter // NATS sender for pending-op drain
|
|
prober ModelProber // health probe for model gRPC addrs
|
|
db *gorm.DB
|
|
interval time.Duration
|
|
scaleDownDelay time.Duration
|
|
// probeStaleAfter: only probe node_models rows older than this so we
|
|
// don't hammer every worker every tick for models we just heard from.
|
|
probeStaleAfter time.Duration
|
|
}
|
|
|
|
// ModelScheduler abstracts the scheduling logic needed by the reconciler.
|
|
// SmartRouter implements this interface.
|
|
type ModelScheduler interface {
|
|
// ScheduleAndLoadModel picks a node (optionally from candidateNodeIDs),
|
|
// installs the backend, and loads the model. Returns the node used.
|
|
ScheduleAndLoadModel(ctx context.Context, modelName string, candidateNodeIDs []string) (*BackendNode, error)
|
|
}
|
|
|
|
// ReplicaReconcilerOptions holds configuration for creating a ReplicaReconciler.
|
|
type ReplicaReconcilerOptions struct {
|
|
Registry *NodeRegistry
|
|
Scheduler ModelScheduler
|
|
Unloader NodeCommandSender
|
|
// Adapter is the NATS sender used to retry pending backend ops. When nil,
|
|
// the state-reconciler pending-drain pass is a no-op (single-node mode).
|
|
Adapter *RemoteUnloaderAdapter
|
|
// RegistrationToken is used by the default gRPC prober when probing model
|
|
// addresses. Matches the worker's token so HealthCheck auth succeeds.
|
|
RegistrationToken string
|
|
// Prober overrides the default gRPC health probe (used by tests).
|
|
Prober ModelProber
|
|
DB *gorm.DB
|
|
Interval time.Duration // default 30s
|
|
ScaleDownDelay time.Duration // default 5m
|
|
ProbeStaleAfter time.Duration // default 2m
|
|
}
|
|
|
|
// NewReplicaReconciler creates a new ReplicaReconciler.
|
|
func NewReplicaReconciler(opts ReplicaReconcilerOptions) *ReplicaReconciler {
|
|
interval := opts.Interval
|
|
if interval == 0 {
|
|
interval = 30 * time.Second
|
|
}
|
|
scaleDownDelay := opts.ScaleDownDelay
|
|
if scaleDownDelay == 0 {
|
|
scaleDownDelay = 5 * time.Minute
|
|
}
|
|
probeStaleAfter := opts.ProbeStaleAfter
|
|
if probeStaleAfter == 0 {
|
|
probeStaleAfter = 2 * time.Minute
|
|
}
|
|
prober := opts.Prober
|
|
if prober == nil {
|
|
prober = grpcModelProber{token: opts.RegistrationToken}
|
|
}
|
|
return &ReplicaReconciler{
|
|
registry: opts.Registry,
|
|
scheduler: opts.Scheduler,
|
|
unloader: opts.Unloader,
|
|
adapter: opts.Adapter,
|
|
prober: prober,
|
|
db: opts.DB,
|
|
interval: interval,
|
|
scaleDownDelay: scaleDownDelay,
|
|
probeStaleAfter: probeStaleAfter,
|
|
}
|
|
}
|
|
|
|
// Run starts the reconciliation loop. It blocks until ctx is cancelled.
|
|
func (rc *ReplicaReconciler) Run(ctx context.Context) {
|
|
ticker := time.NewTicker(rc.interval)
|
|
defer ticker.Stop()
|
|
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-ticker.C:
|
|
rc.reconcileOnce(ctx)
|
|
}
|
|
}
|
|
}
|
|
|
|
// reconcileOnce performs a single reconciliation pass. Replica work and
|
|
// state-reconciliation work run under *different* advisory locks so multiple
|
|
// frontends can share load across passes, and one long-running pass doesn't
|
|
// block the other forever if a frontend wedges.
|
|
func (rc *ReplicaReconciler) reconcileOnce(ctx context.Context) {
|
|
if rc.db != nil {
|
|
replicaKey := advisorylock.KeyFromString("replica-reconciler")
|
|
_ = advisorylock.WithLockCtx(ctx, rc.db, replicaKey, func() error {
|
|
rc.reconcile(ctx)
|
|
return nil
|
|
})
|
|
// Try, don't block: if another frontend is already running the state
|
|
// pass, this tick is a no-op. Matches the health monitor pattern.
|
|
_, _ = advisorylock.TryWithLockCtx(ctx, rc.db, advisorylock.KeyStateReconciler, func() error {
|
|
rc.reconcileState(ctx)
|
|
return nil
|
|
})
|
|
} else {
|
|
rc.reconcile(ctx)
|
|
rc.reconcileState(ctx)
|
|
}
|
|
}
|
|
|
|
// reconcileState runs the state-reconciliation passes: drain pending backend
|
|
// ops for freshly-healthy nodes, then probe model gRPC addresses to orphan
|
|
// ghosts. Both passes are best-effort: a failure on one node doesn't stop
|
|
// the rest.
|
|
func (rc *ReplicaReconciler) reconcileState(ctx context.Context) {
|
|
if rc.adapter != nil {
|
|
rc.drainPendingBackendOps(ctx)
|
|
}
|
|
rc.probeLoadedModels(ctx)
|
|
}
|
|
|
|
// drainPendingBackendOps retries queued backend ops whose next_retry_at has
|
|
// passed on nodes that are currently healthy. On success the row is deleted;
|
|
// on failure attempts++ and next_retry_at moves out via exponential backoff.
|
|
func (rc *ReplicaReconciler) drainPendingBackendOps(ctx context.Context) {
|
|
ops, err := rc.registry.ListDuePendingBackendOps(ctx)
|
|
if err != nil {
|
|
xlog.Warn("Reconciler: failed to list pending backend ops", "error", err)
|
|
return
|
|
}
|
|
if len(ops) == 0 {
|
|
return
|
|
}
|
|
xlog.Debug("Reconciler: draining pending backend ops", "count", len(ops))
|
|
|
|
for _, op := range ops {
|
|
if err := ctx.Err(); err != nil {
|
|
return
|
|
}
|
|
var applyErr error
|
|
switch op.Op {
|
|
case OpBackendDelete:
|
|
_, applyErr = rc.adapter.DeleteBackend(op.NodeID, op.Backend)
|
|
case OpBackendInstall, OpBackendUpgrade:
|
|
reply, err := rc.adapter.InstallBackend(op.NodeID, op.Backend, "", string(op.Galleries), "", "", "")
|
|
if err != nil {
|
|
applyErr = err
|
|
} else if !reply.Success {
|
|
applyErr = fmt.Errorf("%s failed: %s", op.Op, reply.Error)
|
|
}
|
|
default:
|
|
xlog.Warn("Reconciler: unknown pending op", "op", op.Op, "id", op.ID)
|
|
continue
|
|
}
|
|
|
|
if applyErr == nil {
|
|
if err := rc.registry.DeletePendingBackendOp(ctx, op.ID); err != nil {
|
|
xlog.Warn("Reconciler: failed to delete drained op row", "id", op.ID, "error", err)
|
|
} else {
|
|
xlog.Info("Reconciler: pending backend op applied",
|
|
"op", op.Op, "backend", op.Backend, "node", op.NodeID, "attempts", op.Attempts+1)
|
|
}
|
|
continue
|
|
}
|
|
|
|
// ErrNoResponders means the node has no active NATS subscription for
|
|
// this subject. Either its connection dropped, or it's the wrong
|
|
// node type entirely. Mark unhealthy so the health monitor's
|
|
// heartbeat-only pass doesn't immediately flip it back — and so
|
|
// ListDuePendingBackendOps (which filters by status=healthy) stops
|
|
// picking the row until the node genuinely recovers.
|
|
if errors.Is(applyErr, nats.ErrNoResponders) {
|
|
xlog.Warn("Reconciler: no NATS responders — marking node unhealthy",
|
|
"op", op.Op, "backend", op.Backend, "node", op.NodeID)
|
|
_ = rc.registry.MarkUnhealthy(ctx, op.NodeID)
|
|
}
|
|
|
|
// Dead-letter cap: after maxAttempts the row is the reconciler
|
|
// equivalent of a poison message. Delete it loudly so the queue
|
|
// doesn't churn NATS every tick forever — operators can re-issue
|
|
// the op from the UI if they still want it applied.
|
|
if op.Attempts+1 >= maxPendingBackendOpAttempts {
|
|
xlog.Error("Reconciler: abandoning pending backend op after max attempts",
|
|
"op", op.Op, "backend", op.Backend, "node", op.NodeID,
|
|
"attempts", op.Attempts+1, "last_error", applyErr)
|
|
if err := rc.registry.DeletePendingBackendOp(ctx, op.ID); err != nil {
|
|
xlog.Warn("Reconciler: failed to delete abandoned op row", "id", op.ID, "error", err)
|
|
}
|
|
continue
|
|
}
|
|
|
|
_ = rc.registry.RecordPendingBackendOpFailure(ctx, op.ID, applyErr.Error())
|
|
xlog.Warn("Reconciler: pending backend op retry failed",
|
|
"op", op.Op, "backend", op.Backend, "node", op.NodeID, "attempts", op.Attempts+1, "error", applyErr)
|
|
}
|
|
}
|
|
|
|
// maxPendingBackendOpAttempts caps how many times the reconciler retries a
|
|
// failing row before dead-lettering it. Ten attempts at exponential backoff
|
|
// (30s → 15m cap) is >1h of wall-clock patience — well past any transient
|
|
// worker restart or network blip. Poisoned rows beyond that are almost
|
|
// certainly structural (wrong node type, non-existent gallery entry) and no
|
|
// amount of further retrying will help.
|
|
const maxPendingBackendOpAttempts = 10
|
|
|
|
// probeLoadedModels gRPC-health-checks model addresses that the DB says are
|
|
// loaded. If a model's backend process is gone (OOM, crash, manual restart)
|
|
// we remove the row so ghosts don't linger. Only probes rows older than
|
|
// probeStaleAfter so we don't hammer every worker every tick for models we
|
|
// just heard from.
|
|
func (rc *ReplicaReconciler) probeLoadedModels(ctx context.Context) {
|
|
var stale []NodeModel
|
|
cutoff := time.Now().Add(-rc.probeStaleAfter)
|
|
err := rc.registry.db.WithContext(ctx).
|
|
Joins("JOIN backend_nodes ON backend_nodes.id = node_models.node_id").
|
|
Where("node_models.state = ? AND backend_nodes.status = ? AND node_models.updated_at < ? AND node_models.address != ''",
|
|
"loaded", StatusHealthy, cutoff).
|
|
Find(&stale).Error
|
|
if err != nil {
|
|
xlog.Warn("Reconciler: failed to list loaded models for probe", "error", err)
|
|
return
|
|
}
|
|
for _, m := range stale {
|
|
if err := ctx.Err(); err != nil {
|
|
return
|
|
}
|
|
if rc.prober.IsAlive(ctx, m.Address) {
|
|
// Bump updated_at so we don't probe this row again immediately.
|
|
_ = rc.registry.db.WithContext(ctx).Model(&NodeModel{}).
|
|
Where("id = ?", m.ID).Update("updated_at", time.Now()).Error
|
|
continue
|
|
}
|
|
if err := rc.registry.RemoveNodeModel(ctx, m.NodeID, m.ModelName); err != nil {
|
|
xlog.Warn("Reconciler: failed to remove unreachable model", "node", m.NodeID, "model", m.ModelName, "error", err)
|
|
continue
|
|
}
|
|
xlog.Warn("Reconciler: model unreachable, removed from registry",
|
|
"node", m.NodeID, "model", m.ModelName, "address", m.Address)
|
|
}
|
|
}
|
|
|
|
func (rc *ReplicaReconciler) reconcile(ctx context.Context) {
|
|
configs, err := rc.registry.ListAutoScalingConfigs(ctx)
|
|
if err != nil {
|
|
xlog.Warn("Reconciler: failed to list auto-scaling configs", "error", err)
|
|
return
|
|
}
|
|
|
|
for _, cfg := range configs {
|
|
if err := ctx.Err(); err != nil {
|
|
return // context cancelled
|
|
}
|
|
rc.reconcileModel(ctx, cfg)
|
|
}
|
|
}
|
|
|
|
func (rc *ReplicaReconciler) reconcileModel(ctx context.Context, cfg ModelSchedulingConfig) {
|
|
current, err := rc.registry.CountLoadedReplicas(ctx, cfg.ModelName)
|
|
if err != nil {
|
|
xlog.Warn("Reconciler: failed to count replicas", "model", cfg.ModelName, "error", err)
|
|
return
|
|
}
|
|
|
|
// 1. Ensure minimum replicas
|
|
if cfg.MinReplicas > 0 && int(current) < cfg.MinReplicas {
|
|
needed := cfg.MinReplicas - int(current)
|
|
xlog.Info("Reconciler: scaling up to meet minimum", "model", cfg.ModelName,
|
|
"current", current, "min", cfg.MinReplicas, "adding", needed)
|
|
rc.scaleUp(ctx, cfg, needed)
|
|
return
|
|
}
|
|
|
|
// 2. Auto-scale up if all replicas are busy
|
|
if current > 0 && (cfg.MaxReplicas == 0 || int(current) < cfg.MaxReplicas) {
|
|
if rc.allReplicasBusy(ctx, cfg.ModelName) {
|
|
xlog.Info("Reconciler: all replicas busy, scaling up", "model", cfg.ModelName,
|
|
"current", current)
|
|
rc.scaleUp(ctx, cfg, 1)
|
|
}
|
|
}
|
|
|
|
// 3. Scale down idle replicas above minimum
|
|
floor := cfg.MinReplicas
|
|
if floor < 1 {
|
|
floor = 1
|
|
}
|
|
if int(current) > floor {
|
|
rc.scaleDownIdle(ctx, cfg, int(current), floor)
|
|
}
|
|
}
|
|
|
|
// scaleUp schedules additional replicas of the model.
|
|
func (rc *ReplicaReconciler) scaleUp(ctx context.Context, cfg ModelSchedulingConfig, count int) {
|
|
if rc.scheduler == nil {
|
|
xlog.Warn("Reconciler: no scheduler available, cannot scale up")
|
|
return
|
|
}
|
|
|
|
// Determine candidate nodes from selector
|
|
var candidateNodeIDs []string
|
|
if cfg.NodeSelector != "" {
|
|
selector := parseSelector(cfg.NodeSelector)
|
|
if len(selector) > 0 {
|
|
candidates, err := rc.registry.FindNodesBySelector(ctx, selector)
|
|
if err != nil || len(candidates) == 0 {
|
|
xlog.Warn("Reconciler: no nodes match selector", "model", cfg.ModelName,
|
|
"selector", cfg.NodeSelector)
|
|
return
|
|
}
|
|
candidateNodeIDs = make([]string, len(candidates))
|
|
for i, n := range candidates {
|
|
candidateNodeIDs[i] = n.ID
|
|
}
|
|
}
|
|
}
|
|
|
|
for i := 0; i < count; i++ {
|
|
node, err := rc.scheduler.ScheduleAndLoadModel(ctx, cfg.ModelName, candidateNodeIDs)
|
|
if err != nil {
|
|
xlog.Warn("Reconciler: failed to scale up replica", "model", cfg.ModelName,
|
|
"attempt", i+1, "error", err)
|
|
return // stop trying on first failure
|
|
}
|
|
xlog.Info("Reconciler: scaled up replica", "model", cfg.ModelName, "node", node.Name)
|
|
}
|
|
}
|
|
|
|
// scaleDownIdle removes idle replicas above the floor.
|
|
func (rc *ReplicaReconciler) scaleDownIdle(ctx context.Context, cfg ModelSchedulingConfig, current, floor int) {
|
|
if rc.unloader == nil {
|
|
return
|
|
}
|
|
|
|
// Find idle replicas that have been unused for longer than scaleDownDelay
|
|
cutoff := time.Now().Add(-rc.scaleDownDelay)
|
|
var idleModels []NodeModel
|
|
rc.registry.db.WithContext(ctx).
|
|
Where("model_name = ? AND state = ? AND in_flight = 0 AND last_used < ?",
|
|
cfg.ModelName, "loaded", cutoff).
|
|
Order("last_used ASC").
|
|
Find(&idleModels)
|
|
|
|
toRemove := current - floor
|
|
removed := 0
|
|
for _, nm := range idleModels {
|
|
if removed >= toRemove {
|
|
break
|
|
}
|
|
// Remove from registry
|
|
if err := rc.registry.RemoveNodeModel(ctx, nm.NodeID, nm.ModelName); err != nil {
|
|
xlog.Warn("Reconciler: failed to remove model record", "error", err)
|
|
continue
|
|
}
|
|
// Unload from worker
|
|
if err := rc.unloader.UnloadModelOnNode(nm.NodeID, nm.ModelName); err != nil {
|
|
xlog.Warn("Reconciler: unload failed (model already removed from registry)", "error", err)
|
|
}
|
|
xlog.Info("Reconciler: scaled down idle replica", "model", cfg.ModelName, "node", nm.NodeID)
|
|
removed++
|
|
}
|
|
}
|
|
|
|
// allReplicasBusy returns true if all loaded replicas of a model have in-flight requests.
|
|
func (rc *ReplicaReconciler) allReplicasBusy(ctx context.Context, modelName string) bool {
|
|
var idleCount int64
|
|
rc.registry.db.WithContext(ctx).Model(&NodeModel{}).
|
|
Where("model_name = ? AND state = ? AND in_flight = 0", modelName, "loaded").
|
|
Count(&idleCount)
|
|
return idleCount == 0
|
|
}
|
|
|
|
// parseSelector decodes a JSON node selector string into a map.
|
|
func parseSelector(selectorJSON string) map[string]string {
|
|
if selectorJSON == "" {
|
|
return nil
|
|
}
|
|
var selector map[string]string
|
|
if err := json.Unmarshal([]byte(selectorJSON), &selector); err != nil {
|
|
xlog.Warn("Failed to parse node selector", "selector", selectorJSON, "error", err)
|
|
return nil
|
|
}
|
|
return selector
|
|
}
|