2026-03-29 22:47:27 +00:00
|
|
|
package application
|
|
|
|
|
|
|
|
|
|
import (
|
|
|
|
|
"context"
|
|
|
|
|
"encoding/json"
|
|
|
|
|
"fmt"
|
|
|
|
|
"io"
|
|
|
|
|
"strings"
|
|
|
|
|
"sync"
|
2026-03-31 06:28:56 +00:00
|
|
|
"time"
|
2026-03-29 22:47:27 +00:00
|
|
|
|
|
|
|
|
"github.com/google/uuid"
|
|
|
|
|
"github.com/mudler/LocalAI/core/config"
|
|
|
|
|
"github.com/mudler/LocalAI/core/services/agents"
|
|
|
|
|
"github.com/mudler/LocalAI/core/services/distributed"
|
|
|
|
|
"github.com/mudler/LocalAI/core/services/jobs"
|
|
|
|
|
"github.com/mudler/LocalAI/core/services/messaging"
|
|
|
|
|
"github.com/mudler/LocalAI/core/services/nodes"
|
|
|
|
|
"github.com/mudler/LocalAI/core/services/storage"
|
|
|
|
|
"github.com/mudler/LocalAI/pkg/sanitize"
|
|
|
|
|
"github.com/mudler/xlog"
|
|
|
|
|
"gorm.io/gorm"
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
// DistributedServices holds all services initialized for distributed mode.
|
|
|
|
|
type DistributedServices struct {
|
|
|
|
|
Nats *messaging.Client
|
|
|
|
|
Store storage.ObjectStore
|
|
|
|
|
Registry *nodes.NodeRegistry
|
|
|
|
|
Router *nodes.SmartRouter
|
|
|
|
|
Health *nodes.HealthMonitor
|
2026-03-31 06:28:56 +00:00
|
|
|
Reconciler *nodes.ReplicaReconciler
|
2026-03-29 22:47:27 +00:00
|
|
|
JobStore *jobs.JobStore
|
|
|
|
|
Dispatcher *jobs.Dispatcher
|
|
|
|
|
AgentStore *agents.AgentStore
|
|
|
|
|
AgentBridge *agents.EventBridge
|
|
|
|
|
DistStores *distributed.Stores
|
|
|
|
|
FileMgr *storage.FileManager
|
|
|
|
|
FileStager nodes.FileStager
|
|
|
|
|
ModelAdapter *nodes.ModelRouterAdapter
|
|
|
|
|
Unloader *nodes.RemoteUnloaderAdapter
|
|
|
|
|
|
|
|
|
|
shutdownOnce sync.Once
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Shutdown stops all distributed services in reverse initialization order.
|
|
|
|
|
// It is safe to call on a nil receiver and is idempotent (uses sync.Once).
|
|
|
|
|
func (ds *DistributedServices) Shutdown() {
|
|
|
|
|
if ds == nil {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
ds.shutdownOnce.Do(func() {
|
|
|
|
|
if ds.Health != nil {
|
|
|
|
|
ds.Health.Stop()
|
|
|
|
|
}
|
|
|
|
|
if ds.Dispatcher != nil {
|
|
|
|
|
ds.Dispatcher.Stop()
|
|
|
|
|
}
|
|
|
|
|
if closer, ok := ds.Store.(io.Closer); ok {
|
|
|
|
|
closer.Close()
|
|
|
|
|
}
|
|
|
|
|
// AgentBridge has no Close method — its NATS subscriptions are cleaned up
|
|
|
|
|
// when the NATS client is closed below.
|
|
|
|
|
if ds.Nats != nil {
|
|
|
|
|
ds.Nats.Close()
|
|
|
|
|
}
|
|
|
|
|
xlog.Info("Distributed services shut down")
|
|
|
|
|
})
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// initDistributed validates distributed mode prerequisites and initializes
|
|
|
|
|
// NATS, object storage, node registry, and instance identity.
|
|
|
|
|
// Returns nil if distributed mode is not enabled.
|
feat(concurrency-groups): per-model exclusive groups for backend loading (#9662)
* feat(concurrency-groups): per-model exclusive groups for backend loading
Adds `concurrency_groups: [...]` to model YAML configs. Two models that share
a group cannot be loaded concurrently on the same node — loading one evicts
the others, reusing the existing pinned/busy/retry policy from LRU eviction.
Layered design:
- Watchdog (pkg/model): per-node correctness floor — on every Load(), evict
any loaded model that shares a group with the requested one. Pinned skips
surface NeedMore so the loader retries (and ultimately logs a clear
warning), instead of silently allowing the rule to be violated.
- Distributed scheduler (core/services/nodes): soft anti-affinity hint —
scheduleNewModel prefers nodes that don't already host a same-group
model, falling back to eviction only if every candidate has a conflict.
Composes with NodeSelector at the same point in the candidate pipeline.
Per-node, not cluster-wide: VRAM is a node-local resource, and two heavy
models running on different nodes is fine. The ConfigLoader is wired into
SmartRouter via a small ConcurrencyConflictResolver interface so the nodes
package keeps a narrow surface on core/config.
Refactors the inner LRU eviction body into a shared collectEvictionsLocked
helper and the loader retry loop into retryEnforce(fn, maxRetries, interval),
so both LRU and group enforcement share busy/pinned/retry semantics.
Closes #9659.
Assisted-by: Claude:claude-opus-4-7 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* fix(watchdog): sync pinned + concurrency_groups at startup
The startup-time watchdog setup lives in initializeWatchdog (startup.go),
not in startWatchdog (watchdog.go). The latter is only invoked from the
runtime-settings RestartWatchdog path. As a result, neither
SyncPinnedModelsToWatchdog nor SyncModelGroupsToWatchdog ran at boot,
so `pinned: true` and `concurrency_groups: [...]` only became effective
after a settings-driven watchdog restart.
Fix by adding both sync calls to initializeWatchdog. Confirmed end-to-end:
loading model A in group "heavy", then C with no group (coexists),
then B in group "heavy" now correctly evicts A and leaves [B, C].
Assisted-by: Claude:claude-opus-4-7 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* fix(test): satisfy errcheck on new os.Remove in concurrency_groups spec
CI lint runs new-from-merge-base, so the pre-existing
`defer os.Remove(tmp.Name())` lines are baseline-grandfathered but the
one introduced by the concurrency_groups YAML round-trip test is held
to errcheck. Wrap the remove in a closure that discards the error.
Assisted-by: Claude:claude-opus-4-7 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
---------
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-05-05 06:42:50 +00:00
|
|
|
// configLoader is used by the SmartRouter to compute concurrency-group
|
|
|
|
|
// anti-affinity at placement time (#9659); it may be nil in tests.
|
|
|
|
|
func initDistributed(cfg *config.ApplicationConfig, authDB *gorm.DB, configLoader *config.ModelConfigLoader) (*DistributedServices, error) {
|
2026-03-29 22:47:27 +00:00
|
|
|
if !cfg.Distributed.Enabled {
|
|
|
|
|
return nil, nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
xlog.Info("Distributed mode enabled — validating prerequisites")
|
|
|
|
|
|
|
|
|
|
// Validate distributed config (NATS URL, S3 credential pairing, durations, etc.)
|
|
|
|
|
if err := cfg.Distributed.Validate(); err != nil {
|
|
|
|
|
return nil, err
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Validate PostgreSQL is configured (auth DB must be PostgreSQL for distributed mode)
|
|
|
|
|
if !cfg.Auth.Enabled {
|
|
|
|
|
return nil, fmt.Errorf("distributed mode requires authentication to be enabled (--auth / LOCALAI_AUTH=true)")
|
|
|
|
|
}
|
|
|
|
|
if !isPostgresURL(cfg.Auth.DatabaseURL) {
|
|
|
|
|
return nil, fmt.Errorf("distributed mode requires PostgreSQL for auth database (got %q)", sanitize.URL(cfg.Auth.DatabaseURL))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Generate instance ID if not set
|
|
|
|
|
if cfg.Distributed.InstanceID == "" {
|
|
|
|
|
cfg.Distributed.InstanceID = uuid.New().String()
|
|
|
|
|
}
|
|
|
|
|
xlog.Info("Distributed instance", "id", cfg.Distributed.InstanceID)
|
|
|
|
|
|
|
|
|
|
// Connect to NATS
|
|
|
|
|
natsClient, err := messaging.New(cfg.Distributed.NatsURL)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return nil, fmt.Errorf("connecting to NATS: %w", err)
|
|
|
|
|
}
|
|
|
|
|
xlog.Info("Connected to NATS", "url", sanitize.URL(cfg.Distributed.NatsURL))
|
|
|
|
|
|
|
|
|
|
// Ensure NATS is closed if any subsequent initialization step fails.
|
|
|
|
|
success := false
|
|
|
|
|
defer func() {
|
|
|
|
|
if !success {
|
|
|
|
|
natsClient.Close()
|
|
|
|
|
}
|
|
|
|
|
}()
|
|
|
|
|
|
|
|
|
|
// Initialize object storage
|
|
|
|
|
var store storage.ObjectStore
|
|
|
|
|
if cfg.Distributed.StorageURL != "" {
|
|
|
|
|
if cfg.Distributed.StorageBucket == "" {
|
|
|
|
|
return nil, fmt.Errorf("distributed storage bucket must be set when storage URL is configured")
|
|
|
|
|
}
|
|
|
|
|
s3Store, err := storage.NewS3Store(context.Background(), storage.S3Config{
|
|
|
|
|
Endpoint: cfg.Distributed.StorageURL,
|
|
|
|
|
Region: cfg.Distributed.StorageRegion,
|
|
|
|
|
Bucket: cfg.Distributed.StorageBucket,
|
|
|
|
|
AccessKeyID: cfg.Distributed.StorageAccessKey,
|
|
|
|
|
SecretAccessKey: cfg.Distributed.StorageSecretKey,
|
|
|
|
|
ForcePathStyle: true, // required for MinIO
|
|
|
|
|
})
|
|
|
|
|
if err != nil {
|
|
|
|
|
return nil, fmt.Errorf("initializing S3 storage: %w", err)
|
|
|
|
|
}
|
|
|
|
|
xlog.Info("Object storage initialized (S3)", "endpoint", cfg.Distributed.StorageURL, "bucket", cfg.Distributed.StorageBucket)
|
|
|
|
|
store = s3Store
|
|
|
|
|
} else {
|
|
|
|
|
// Fallback to filesystem storage in distributed mode (useful for single-node testing)
|
|
|
|
|
fsStore, err := storage.NewFilesystemStore(cfg.DataPath + "/objectstore")
|
|
|
|
|
if err != nil {
|
|
|
|
|
return nil, fmt.Errorf("initializing filesystem storage: %w", err)
|
|
|
|
|
}
|
|
|
|
|
xlog.Info("Object storage initialized (filesystem fallback)", "path", cfg.DataPath+"/objectstore")
|
|
|
|
|
store = fsStore
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Initialize node registry (requires the auth DB which is PostgreSQL)
|
|
|
|
|
if authDB == nil {
|
|
|
|
|
return nil, fmt.Errorf("distributed mode requires auth database to be initialized first")
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
registry, err := nodes.NewNodeRegistry(authDB)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return nil, fmt.Errorf("initializing node registry: %w", err)
|
|
|
|
|
}
|
|
|
|
|
xlog.Info("Node registry initialized")
|
|
|
|
|
|
|
|
|
|
// Collect SmartRouter option values; the router itself is created after all
|
|
|
|
|
// dependencies (including FileStager and Unloader) are ready.
|
|
|
|
|
var routerAuthToken string
|
|
|
|
|
if cfg.Distributed.RegistrationToken != "" {
|
|
|
|
|
routerAuthToken = cfg.Distributed.RegistrationToken
|
|
|
|
|
}
|
|
|
|
|
var routerGalleriesJSON string
|
|
|
|
|
if galleriesJSON, err := json.Marshal(cfg.BackendGalleries); err == nil {
|
|
|
|
|
routerGalleriesJSON = string(galleriesJSON)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
healthMon := nodes.NewHealthMonitor(registry, authDB,
|
|
|
|
|
cfg.Distributed.HealthCheckIntervalOrDefault(),
|
|
|
|
|
cfg.Distributed.StaleNodeThresholdOrDefault(),
|
|
|
|
|
routerAuthToken,
|
fix(distributed): cascade-clean stale node_models rows + filter routing by healthy status (#9754)
* fix(distributed): cascade-clean stale node_models on drain and filter routing by healthy status
Stale node_models rows (state="loaded") were surviving past the healthy
state of their owning node, causing /embeddings (and other inference
paths) to dispatch to a backend whose process was gone or drained. The
downstream symptom in a live cluster was pgvector rejecting inserts
with "vector cannot have more than 16000 dimensions (SQLSTATE 54000)"
because the misbehaving backend silently returned a malformed
(oversized) tensor; the Models page showed the model as "running"
without an associated node, like a stale entry, even though the node
was no longer visible in the Nodes view.
Two changes here, plus a third in a follow-up commit:
- MarkDraining now cascade-deletes node_models rows for the affected
node, mirroring MarkOffline. Drains are explicit operator actions —
the box has been intentionally taken out of rotation — so clearing
the rows stops the Models UI from misreporting and prevents the
routing layer from picking those rows if scheduling logic is ever
relaxed. In-flight requests already hold their gRPC client through
Route() and finish normally; the only observable effect is a
non-fatal IncrementInFlight warning, acceptable for a drain.
MarkUnhealthy is deliberately left status-only: it fires from
managers_distributed / reconciler on a single nats.ErrNoResponders
with no retry, so a transient NATS hiccup must not nuke every loaded
model and force a full reload on recovery.
- FindAndLockNodeWithModel's inner JOIN now filters on
backend_nodes.status = healthy in addition to node_models.state =
loaded. The previous version relied on the second node-fetch step to
reject non-healthy nodes, but a concurrent reader could still pick
the same stale row in the same window. Belt-and-braces.
- DistributedConfig.PerModelHealthCheck renamed to
DisablePerModelHealthCheck and inverted at the call site so
per-model gRPC probing is on by default. The probe (now made
consecutive-miss aware in a follow-up commit) independently health-
checks each model's gRPC address and removes stale node_models rows
when the backend has crashed even though the worker's node-level
heartbeat is still arriving.
Migration: the field had no CLI flag, env var binding, or YAML key
in tree (only the bare struct field), so there is no user-facing
migration. Anything constructing DistributedConfig in code needs to
drop the assignment (default now does the right thing) or invert it.
Assisted-by: Claude:claude-opus-4-7 go-vet go-test golangci-lint
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* fix(distributed): require consecutive misses before per-model probe removes a row
The per-model gRPC probe used to remove a node_models row on a single
failed health check. With the per-model probe now on by default, that
made any 5-second gRPC blip (network jitter, a long-running request
hogging the worker's gRPC server thread, brief GC pause) trigger a
full reload of the affected model — too eager for production.
Require perModelMissThreshold (3) consecutive failed probes before
removal. At the default 15s tick a model must be unreachable for ~45s
before reap; a single successful probe in between resets the streak.
Per-(node, model, replica) state tracked under a mutex on the monitor.
If the removal call itself fails, the miss counter is left in place
so the next tick retries rather than starting the streak over.
Tests:
- removes stale model via per-model health check after consecutive
failures (replaces the single-shot expectation)
- preserves model row when an intermittent failure is followed by a
success (covers the reset-on-success path and verifies the counter
reset by failing twice more without crossing threshold)
- newTestHealthMonitor initializes the misses map so direct-construct
test helpers don't nil-map-panic in the probe path
Assisted-by: Claude:claude-opus-4-7 go-vet go-test golangci-lint
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
---------
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
2026-05-13 19:57:50 +00:00
|
|
|
!cfg.Distributed.DisablePerModelHealthCheck,
|
2026-03-29 22:47:27 +00:00
|
|
|
)
|
|
|
|
|
|
|
|
|
|
// Initialize job store
|
|
|
|
|
jobStore, err := jobs.NewJobStore(authDB)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return nil, fmt.Errorf("initializing job store: %w", err)
|
|
|
|
|
}
|
|
|
|
|
xlog.Info("Distributed job store initialized")
|
|
|
|
|
|
|
|
|
|
// Initialize job dispatcher
|
|
|
|
|
dispatcher := jobs.NewDispatcher(jobStore, natsClient, authDB, cfg.Distributed.InstanceID, cfg.Distributed.JobWorkerConcurrency)
|
|
|
|
|
|
|
|
|
|
// Initialize agent store
|
|
|
|
|
agentStore, err := agents.NewAgentStore(authDB)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return nil, fmt.Errorf("initializing agent store: %w", err)
|
|
|
|
|
}
|
|
|
|
|
xlog.Info("Distributed agent store initialized")
|
|
|
|
|
|
|
|
|
|
// Initialize agent event bridge
|
|
|
|
|
agentBridge := agents.NewEventBridge(natsClient, agentStore, cfg.Distributed.InstanceID)
|
|
|
|
|
|
|
|
|
|
// Start observable persister — captures observable_update events from workers
|
|
|
|
|
// (which have no DB access) and persists them to PostgreSQL.
|
|
|
|
|
if err := agentBridge.StartObservablePersister(); err != nil {
|
|
|
|
|
xlog.Warn("Failed to start observable persister", "error", err)
|
|
|
|
|
} else {
|
|
|
|
|
xlog.Info("Observable persister started")
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Initialize Phase 4 stores (MCP, Gallery, FineTune, Skills)
|
|
|
|
|
distStores, err := distributed.InitStores(authDB)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return nil, fmt.Errorf("initializing distributed stores: %w", err)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Initialize file manager with local cache
|
|
|
|
|
cacheDir := cfg.DataPath + "/cache"
|
|
|
|
|
fileMgr, err := storage.NewFileManager(store, cacheDir)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return nil, fmt.Errorf("initializing file manager: %w", err)
|
|
|
|
|
}
|
|
|
|
|
xlog.Info("File manager initialized", "cacheDir", cacheDir)
|
|
|
|
|
|
|
|
|
|
// Create FileStager for distributed file transfer
|
|
|
|
|
var fileStager nodes.FileStager
|
|
|
|
|
if cfg.Distributed.StorageURL != "" {
|
|
|
|
|
fileStager = nodes.NewS3NATSFileStager(fileMgr, natsClient)
|
|
|
|
|
xlog.Info("File stager initialized (S3+NATS)")
|
|
|
|
|
} else {
|
|
|
|
|
fileStager = nodes.NewHTTPFileStager(func(nodeID string) (string, error) {
|
|
|
|
|
node, err := registry.Get(context.Background(), nodeID)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return "", err
|
|
|
|
|
}
|
|
|
|
|
if node.HTTPAddress == "" {
|
|
|
|
|
return "", fmt.Errorf("node %s has no HTTP address for file transfer", nodeID)
|
|
|
|
|
}
|
|
|
|
|
return node.HTTPAddress, nil
|
|
|
|
|
}, cfg.Distributed.RegistrationToken)
|
|
|
|
|
xlog.Info("File stager initialized (HTTP direct transfer)")
|
|
|
|
|
}
|
|
|
|
|
// Create RemoteUnloaderAdapter — needed by SmartRouter and startup.go
|
|
|
|
|
remoteUnloader := nodes.NewRemoteUnloaderAdapter(registry, natsClient)
|
|
|
|
|
|
|
|
|
|
// All dependencies ready — build SmartRouter with all options at once
|
feat(concurrency-groups): per-model exclusive groups for backend loading (#9662)
* feat(concurrency-groups): per-model exclusive groups for backend loading
Adds `concurrency_groups: [...]` to model YAML configs. Two models that share
a group cannot be loaded concurrently on the same node — loading one evicts
the others, reusing the existing pinned/busy/retry policy from LRU eviction.
Layered design:
- Watchdog (pkg/model): per-node correctness floor — on every Load(), evict
any loaded model that shares a group with the requested one. Pinned skips
surface NeedMore so the loader retries (and ultimately logs a clear
warning), instead of silently allowing the rule to be violated.
- Distributed scheduler (core/services/nodes): soft anti-affinity hint —
scheduleNewModel prefers nodes that don't already host a same-group
model, falling back to eviction only if every candidate has a conflict.
Composes with NodeSelector at the same point in the candidate pipeline.
Per-node, not cluster-wide: VRAM is a node-local resource, and two heavy
models running on different nodes is fine. The ConfigLoader is wired into
SmartRouter via a small ConcurrencyConflictResolver interface so the nodes
package keeps a narrow surface on core/config.
Refactors the inner LRU eviction body into a shared collectEvictionsLocked
helper and the loader retry loop into retryEnforce(fn, maxRetries, interval),
so both LRU and group enforcement share busy/pinned/retry semantics.
Closes #9659.
Assisted-by: Claude:claude-opus-4-7 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* fix(watchdog): sync pinned + concurrency_groups at startup
The startup-time watchdog setup lives in initializeWatchdog (startup.go),
not in startWatchdog (watchdog.go). The latter is only invoked from the
runtime-settings RestartWatchdog path. As a result, neither
SyncPinnedModelsToWatchdog nor SyncModelGroupsToWatchdog ran at boot,
so `pinned: true` and `concurrency_groups: [...]` only became effective
after a settings-driven watchdog restart.
Fix by adding both sync calls to initializeWatchdog. Confirmed end-to-end:
loading model A in group "heavy", then C with no group (coexists),
then B in group "heavy" now correctly evicts A and leaves [B, C].
Assisted-by: Claude:claude-opus-4-7 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* fix(test): satisfy errcheck on new os.Remove in concurrency_groups spec
CI lint runs new-from-merge-base, so the pre-existing
`defer os.Remove(tmp.Name())` lines are baseline-grandfathered but the
one introduced by the concurrency_groups YAML round-trip test is held
to errcheck. Wrap the remove in a closure that discards the error.
Assisted-by: Claude:claude-opus-4-7 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
---------
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-05-05 06:42:50 +00:00
|
|
|
var conflictResolver nodes.ConcurrencyConflictResolver
|
|
|
|
|
if configLoader != nil {
|
|
|
|
|
conflictResolver = configLoader
|
|
|
|
|
}
|
2026-03-29 22:47:27 +00:00
|
|
|
router := nodes.NewSmartRouter(registry, nodes.SmartRouterOptions{
|
feat(concurrency-groups): per-model exclusive groups for backend loading (#9662)
* feat(concurrency-groups): per-model exclusive groups for backend loading
Adds `concurrency_groups: [...]` to model YAML configs. Two models that share
a group cannot be loaded concurrently on the same node — loading one evicts
the others, reusing the existing pinned/busy/retry policy from LRU eviction.
Layered design:
- Watchdog (pkg/model): per-node correctness floor — on every Load(), evict
any loaded model that shares a group with the requested one. Pinned skips
surface NeedMore so the loader retries (and ultimately logs a clear
warning), instead of silently allowing the rule to be violated.
- Distributed scheduler (core/services/nodes): soft anti-affinity hint —
scheduleNewModel prefers nodes that don't already host a same-group
model, falling back to eviction only if every candidate has a conflict.
Composes with NodeSelector at the same point in the candidate pipeline.
Per-node, not cluster-wide: VRAM is a node-local resource, and two heavy
models running on different nodes is fine. The ConfigLoader is wired into
SmartRouter via a small ConcurrencyConflictResolver interface so the nodes
package keeps a narrow surface on core/config.
Refactors the inner LRU eviction body into a shared collectEvictionsLocked
helper and the loader retry loop into retryEnforce(fn, maxRetries, interval),
so both LRU and group enforcement share busy/pinned/retry semantics.
Closes #9659.
Assisted-by: Claude:claude-opus-4-7 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* fix(watchdog): sync pinned + concurrency_groups at startup
The startup-time watchdog setup lives in initializeWatchdog (startup.go),
not in startWatchdog (watchdog.go). The latter is only invoked from the
runtime-settings RestartWatchdog path. As a result, neither
SyncPinnedModelsToWatchdog nor SyncModelGroupsToWatchdog ran at boot,
so `pinned: true` and `concurrency_groups: [...]` only became effective
after a settings-driven watchdog restart.
Fix by adding both sync calls to initializeWatchdog. Confirmed end-to-end:
loading model A in group "heavy", then C with no group (coexists),
then B in group "heavy" now correctly evicts A and leaves [B, C].
Assisted-by: Claude:claude-opus-4-7 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* fix(test): satisfy errcheck on new os.Remove in concurrency_groups spec
CI lint runs new-from-merge-base, so the pre-existing
`defer os.Remove(tmp.Name())` lines are baseline-grandfathered but the
one introduced by the concurrency_groups YAML round-trip test is held
to errcheck. Wrap the remove in a closure that discards the error.
Assisted-by: Claude:claude-opus-4-7 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
---------
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-05-05 06:42:50 +00:00
|
|
|
Unloader: remoteUnloader,
|
|
|
|
|
FileStager: fileStager,
|
|
|
|
|
GalleriesJSON: routerGalleriesJSON,
|
|
|
|
|
AuthToken: routerAuthToken,
|
|
|
|
|
DB: authDB,
|
|
|
|
|
ConflictResolver: conflictResolver,
|
2026-03-29 22:47:27 +00:00
|
|
|
})
|
|
|
|
|
|
feat(distributed): sync state with frontends, better backend management reporting (#9426)
* fix(distributed): detect backend upgrades across worker nodes
Before this change `DistributedBackendManager.CheckUpgrades` delegated to the
local manager, which read backends from the frontend filesystem. In
distributed deployments the frontend has no backends installed locally —
they live on workers — so the upgrade-detection loop never ran and the UI
silently never surfaced upgrades even when the gallery advertised newer
versions or digests.
Worker-side: NATS backend.list reply now carries Version, URI and Digest
for each installed backend (read from metadata.json).
Frontend-side: DistributedBackendManager.ListBackends aggregates per-node
refs (name, status, version, digest) instead of deduping, and CheckUpgrades
feeds that aggregation into gallery.CheckUpgradesAgainst — a new entrypoint
factored out of CheckBackendUpgrades so both paths share the same core
logic.
Cluster drift policy: when per-node version/digest tuples disagree, the
backend is flagged upgradeable regardless of whether any single node
matches the gallery, and UpgradeInfo.NodeDrift enumerates the outliers so
operators can see *why* it is out of sync. The next upgrade-all realigns
the cluster.
Tests cover: drift detection, unanimous-match (no upgrade), and the
empty-installed-version path that the old distributed code silently
missed.
* feat(ui): surface backend upgrades in the System page
The System page (Manage.jsx) only showed updates as a tiny inline arrow,
so operators routinely missed them. Port the Backend Gallery's upgrade UX
so System speaks the same visual language:
- Yellow banner at the top of the Backends tab when upgrades are pending,
with an "Upgrade all" button (serial fan-out, matches the gallery) and a
"Updates only" filter toggle.
- Warning pill (↑ N) next to the tab label so the count is glanceable even
when the banner is scrolled out of view.
- Per-row labeled "Upgrade to vX.Y" button (replaces the icon-only button
that silently flipped semantics between Reinstall and Upgrade), plus an
"Update available" badge in the new Version column.
- New columns: Version (with upgrade + drift chips), Nodes (per-node
attribution badges for distributed mode, degrading to a compact
"on N nodes · M offline" chip above three nodes), Installed (relative
time).
- System backends render a "Protected" chip instead of a bare "—" so rows
still align and the reason is obvious.
- Delete uses the softer btn-danger-ghost so rows don't scream red; the
ConfirmDialog still owns the "are you sure".
The upgrade checker also needed the same per-worker fix as the previous
commit: NewUpgradeChecker now takes a BackendManager getter so its
periodic runs call the distributed CheckUpgrades (which asks workers)
instead of the empty frontend filesystem. Without this the /api/backends/
upgrades endpoint stayed empty in distributed mode even with the protocol
change in place.
New CSS primitives — .upgrade-banner, .tab-pill, .badge-row, .cell-stack,
.cell-mono, .cell-muted, .row-actions, .btn-danger-ghost — all live in
App.css so other pages can adopt them without duplicating styles.
* feat(ui): polish the Nodes page so it reads like a product
The Nodes page was the biggest visual liability in distributed mode.
Rework the main dashboard surfaces in place without changing behavior:
StatCards: uniform height (96px min), left accent bar colored by the
metric's semantic (success/warning/error/primary), icon lives in a
36x36 soft-tinted chip top-right, value is left-aligned and large.
Grid auto-fills so the row doesn't collapse on narrow viewports. This
replaces the previous thin-bordered boxes with inconsistent heights.
Table rows: expandable rows now show a chevron cue on the left (rotates
on expand) so users know rows open. Status cell became a dedicated chip
with an LED-style halo dot instead of a bare bullet. Action buttons gained
labels — "Approve", "Resume", "Drain" — so the icons aren't doing all
the semantic work; the destructive remove action uses the softer
btn-danger-ghost variant so rows don't scream red, with the ConfirmDialog
still owning the real "are you sure". Applied cell-mono/cell-muted
utility classes so label chips and addresses share one spacing/font
grammar instead of re-declaring inline styles everywhere.
Expanded drawer: empty states for Loaded Models and Installed Backends
now render as a proper drawer-empty card (dashed border, icon, one-line
hint) instead of a plain muted string that read like broken formatting.
Tabs: three inline-styled buttons became the shared .tab class so they
inherit focus ring, hover state, and the rest of the design system —
matches the System page.
"Add more workers" toggle turned into a .nodes-add-worker dashed-border
button labelled "Register a new worker" (action voice) instead of a
chevron + muted link that operators kept mistaking for broken text.
New shared CSS primitives carry over to other pages:
.stat-grid + .stat-card, .row-chevron, .node-status, .drawer-empty,
.nodes-add-worker.
* feat(distributed): durable backend fan-out + state reconciliation
Two connected problems handled together:
1) Backend delete/install/upgrade used to silently skip non-healthy nodes,
so a delete during an outage left a zombie on the offline node once it
returned. The fan-out now records intent in a new pending_backend_ops
table before attempting the NATS round-trip. Currently-healthy nodes
get an immediate attempt; everyone else is queued. Unique index on
(node_id, backend, op) means reissuing the same operation refreshes
next_retry_at instead of stacking duplicates.
2) Loaded-model state could drift from reality: a worker OOM'd, got
killed, or restarted a backend process would leave a node_models row
claiming the model was still loaded, feeding ghost entries into the
/api/nodes/models listing and the router's scheduling decisions.
The existing ReplicaReconciler gains two new passes that run under a
fresh KeyStateReconciler advisory lock (non-blocking, so one wedged
frontend doesn't freeze the cluster):
- drainPendingBackendOps: retries queued ops whose next_retry_at has
passed on currently-healthy nodes. Success deletes the row; failure
bumps attempts and pushes next_retry_at out with exponential backoff
(30s → 15m cap). ErrNoResponders also marks the node unhealthy.
- probeLoadedModels: gRPC-HealthChecks addresses the DB thinks are
loaded but that have not been touched in the last probeStaleAfter (2m).
Unreachable addresses are removed from the registry. A pluggable
ModelProber lets tests substitute a fake without standing up gRPC.
DistributedBackendManager exposes DeleteBackendDetailed so the HTTP
handler can surface per-node outcomes ("2 succeeded, 1 queued") to the
UI in a follow-up commit; the existing DeleteBackend still returns
error-only for callers that don't care about node breakdown.
Multi-frontend safety: the state pass uses advisorylock.TryWithLockCtx
on a new key so N frontends coordinate — the same pattern the health
monitor and replica reconciler already rely on. Single-node mode runs
both passes inline (adapter is nil, state drain is a no-op).
Tests cover the upsert semantics, backoff math, the probe removing an
unreachable model but keeping a reachable one, and filtering by
probeStaleAfter.
* feat(ui): show cluster distribution of models in the System page
When a frontend restarted in distributed mode, models that workers had
already loaded weren't visible until the operator clicked into each node
manually — the /api/models/capabilities endpoint only knew about
configs on the frontend's filesystem, not the registry-backed truth.
/api/models/capabilities now joins in ListAllLoadedModels() when the
registry is active, returning loaded_on[] with node id/name/state/status
for each model. Models that live in the registry but lack a local config
(the actual ghosts, not recovered from the frontend's file cache) still
surface with source="registry-only" so operators can see and persist
them; without that emission they'd be invisible to this frontend.
Manage → Models replaces the old Running/Idle pill with a distribution
cell that lists the first three nodes the model is loaded on as chips
colored by state (green loaded, blue loading, amber anything else). On
wider clusters the remaining count collapses into a +N chip with a
title-attribute breakdown. Disabled / single-node behavior unchanged.
Adopted models get an extra "Adopted" ghost-icon chip with hover copy
explaining what it means and how to make it permanent.
Distributed mode also enables a 10s auto-refresh and a "Last synced Xs
ago" indicator next to the Update button so ghost rows drop off within
one reconcile tick after their owning process dies. Non-distributed
mode is untouched — no polling, no cell-stack, same old Running/Idle.
* feat(ui): NodeDistributionChip — shared per-node attribution component
Large clusters were going to break the Manage → Backends Nodes column:
the old inline logic rendered every node as a badge and would shred the
layout at >10 workers, plus the Manage → Models distribution cell had
copy-pasted its own slightly-different version.
NodeDistributionChip handles any cluster size with two render modes:
- small (≤3 nodes): inline chips of node names, colored by health.
- large: a single "on N nodes · M offline · K drift" summary chip;
clicking opens a Popover with a per-node table (name, status,
version, digest for backends; name, status, state for models).
Drift counting mirrors the backend's summarizeNodeDrift so the UI
number matches UpgradeInfo.NodeDrift. Digests are truncated to the
docker-style 12-char form with the full value preserved in the title.
Popover is a new general-purpose primitive: fixed positioning anchored
to the trigger, flips above when there's no room below, closes on
outside-click or Escape, returns focus to the trigger. Uses .card as
its surface so theming is inherited. Also useful for a future
labels-editor popup and the user menu.
Manage.jsx drops its duplicated inline Nodes-column + loaded_on cell
and uses the shared chip with context="backends" / "models"
respectively. Delete code removes ~40 lines of ad-hoc logic.
* feat(ui): shared FilterBar across the System page tabs
The Backends gallery had a nice search + chip + toggle strip; the System
page had nothing, so the two surfaces felt like different apps. Lift the
pattern into a reusable FilterBar and wire both System tabs through it.
New component core/http/react-ui/src/components/FilterBar.jsx renders a
search input, a role="tablist" chip row (aria-selected for a11y), and
optional toggles / right slot. Chips support an optional `count` which
the System page uses to show "User 3", "Updates 1" etc.
System Models tab: search by id or backend; chips for
All/Running/Idle/Disabled/Pinned plus a conditional Distributed chip in
distributed mode. "Last synced" + Update button live in the right slot.
System Backends tab: search by name/alias/meta-backend-for; chips for
All/User/System/Meta plus conditional Updates / Offline-nodes chips
when relevant. The old ad-hoc "Updates only" toggle from the upgrade
banner folded into the Updates chip — one source of truth for that
filter. Offline chip only appears in distributed mode when at least
one backend has an unhealthy node, so the chip row stays quiet on
healthy clusters.
Filter state persists in URL query params (mq/mf/bq/bf) so deep links
and tab switches keep the operator's filter context instead of
resetting every time.
Also adds an "Adopted" distribution path: when a model in
/api/models/capabilities carries source="registry-only" (discovered on
a worker but not configured locally), the Models tab shows a ghost chip
labelled "Adopted" with hover copy explaining how to persist it — this
is what closes the loop on the ghost-model story end-to-end.
2026-04-19 15:55:53 +00:00
|
|
|
// Create ReplicaReconciler for auto-scaling model replicas. Adapter +
|
|
|
|
|
// RegistrationToken feed the state-reconciliation passes: pending op
|
|
|
|
|
// drain uses the adapter, and model health probes use the token to auth
|
|
|
|
|
// against workers' gRPC HealthCheck.
|
2026-03-31 06:28:56 +00:00
|
|
|
reconciler := nodes.NewReplicaReconciler(nodes.ReplicaReconcilerOptions{
|
feat(distributed): sync state with frontends, better backend management reporting (#9426)
* fix(distributed): detect backend upgrades across worker nodes
Before this change `DistributedBackendManager.CheckUpgrades` delegated to the
local manager, which read backends from the frontend filesystem. In
distributed deployments the frontend has no backends installed locally —
they live on workers — so the upgrade-detection loop never ran and the UI
silently never surfaced upgrades even when the gallery advertised newer
versions or digests.
Worker-side: NATS backend.list reply now carries Version, URI and Digest
for each installed backend (read from metadata.json).
Frontend-side: DistributedBackendManager.ListBackends aggregates per-node
refs (name, status, version, digest) instead of deduping, and CheckUpgrades
feeds that aggregation into gallery.CheckUpgradesAgainst — a new entrypoint
factored out of CheckBackendUpgrades so both paths share the same core
logic.
Cluster drift policy: when per-node version/digest tuples disagree, the
backend is flagged upgradeable regardless of whether any single node
matches the gallery, and UpgradeInfo.NodeDrift enumerates the outliers so
operators can see *why* it is out of sync. The next upgrade-all realigns
the cluster.
Tests cover: drift detection, unanimous-match (no upgrade), and the
empty-installed-version path that the old distributed code silently
missed.
* feat(ui): surface backend upgrades in the System page
The System page (Manage.jsx) only showed updates as a tiny inline arrow,
so operators routinely missed them. Port the Backend Gallery's upgrade UX
so System speaks the same visual language:
- Yellow banner at the top of the Backends tab when upgrades are pending,
with an "Upgrade all" button (serial fan-out, matches the gallery) and a
"Updates only" filter toggle.
- Warning pill (↑ N) next to the tab label so the count is glanceable even
when the banner is scrolled out of view.
- Per-row labeled "Upgrade to vX.Y" button (replaces the icon-only button
that silently flipped semantics between Reinstall and Upgrade), plus an
"Update available" badge in the new Version column.
- New columns: Version (with upgrade + drift chips), Nodes (per-node
attribution badges for distributed mode, degrading to a compact
"on N nodes · M offline" chip above three nodes), Installed (relative
time).
- System backends render a "Protected" chip instead of a bare "—" so rows
still align and the reason is obvious.
- Delete uses the softer btn-danger-ghost so rows don't scream red; the
ConfirmDialog still owns the "are you sure".
The upgrade checker also needed the same per-worker fix as the previous
commit: NewUpgradeChecker now takes a BackendManager getter so its
periodic runs call the distributed CheckUpgrades (which asks workers)
instead of the empty frontend filesystem. Without this the /api/backends/
upgrades endpoint stayed empty in distributed mode even with the protocol
change in place.
New CSS primitives — .upgrade-banner, .tab-pill, .badge-row, .cell-stack,
.cell-mono, .cell-muted, .row-actions, .btn-danger-ghost — all live in
App.css so other pages can adopt them without duplicating styles.
* feat(ui): polish the Nodes page so it reads like a product
The Nodes page was the biggest visual liability in distributed mode.
Rework the main dashboard surfaces in place without changing behavior:
StatCards: uniform height (96px min), left accent bar colored by the
metric's semantic (success/warning/error/primary), icon lives in a
36x36 soft-tinted chip top-right, value is left-aligned and large.
Grid auto-fills so the row doesn't collapse on narrow viewports. This
replaces the previous thin-bordered boxes with inconsistent heights.
Table rows: expandable rows now show a chevron cue on the left (rotates
on expand) so users know rows open. Status cell became a dedicated chip
with an LED-style halo dot instead of a bare bullet. Action buttons gained
labels — "Approve", "Resume", "Drain" — so the icons aren't doing all
the semantic work; the destructive remove action uses the softer
btn-danger-ghost variant so rows don't scream red, with the ConfirmDialog
still owning the real "are you sure". Applied cell-mono/cell-muted
utility classes so label chips and addresses share one spacing/font
grammar instead of re-declaring inline styles everywhere.
Expanded drawer: empty states for Loaded Models and Installed Backends
now render as a proper drawer-empty card (dashed border, icon, one-line
hint) instead of a plain muted string that read like broken formatting.
Tabs: three inline-styled buttons became the shared .tab class so they
inherit focus ring, hover state, and the rest of the design system —
matches the System page.
"Add more workers" toggle turned into a .nodes-add-worker dashed-border
button labelled "Register a new worker" (action voice) instead of a
chevron + muted link that operators kept mistaking for broken text.
New shared CSS primitives carry over to other pages:
.stat-grid + .stat-card, .row-chevron, .node-status, .drawer-empty,
.nodes-add-worker.
* feat(distributed): durable backend fan-out + state reconciliation
Two connected problems handled together:
1) Backend delete/install/upgrade used to silently skip non-healthy nodes,
so a delete during an outage left a zombie on the offline node once it
returned. The fan-out now records intent in a new pending_backend_ops
table before attempting the NATS round-trip. Currently-healthy nodes
get an immediate attempt; everyone else is queued. Unique index on
(node_id, backend, op) means reissuing the same operation refreshes
next_retry_at instead of stacking duplicates.
2) Loaded-model state could drift from reality: a worker OOM'd, got
killed, or restarted a backend process would leave a node_models row
claiming the model was still loaded, feeding ghost entries into the
/api/nodes/models listing and the router's scheduling decisions.
The existing ReplicaReconciler gains two new passes that run under a
fresh KeyStateReconciler advisory lock (non-blocking, so one wedged
frontend doesn't freeze the cluster):
- drainPendingBackendOps: retries queued ops whose next_retry_at has
passed on currently-healthy nodes. Success deletes the row; failure
bumps attempts and pushes next_retry_at out with exponential backoff
(30s → 15m cap). ErrNoResponders also marks the node unhealthy.
- probeLoadedModels: gRPC-HealthChecks addresses the DB thinks are
loaded but that haven't been touched in the last probeStaleAfter (2m).
Unreachable addresses are removed from the registry. A pluggable
ModelProber lets tests substitute a fake without standing up gRPC.
DistributedBackendManager exposes DeleteBackendDetailed so the HTTP
handler can surface per-node outcomes ("2 succeeded, 1 queued") to the
UI in a follow-up commit; the existing DeleteBackend still returns
error-only for callers that don't care about node breakdown.
Multi-frontend safety: the state pass uses advisorylock.TryWithLockCtx
on a new key so N frontends coordinate — the same pattern the health
monitor and replica reconciler already rely on. Single-node mode runs
both passes inline (adapter is nil, state drain is a no-op).
Tests cover the upsert semantics, backoff math, the probe removing an
unreachable model but keeping a reachable one, and filtering by
probeStaleAfter.
* feat(ui): show cluster distribution of models in the System page
When a frontend restarted in distributed mode, models that workers had
already loaded weren't visible until the operator clicked into each node
manually — the /api/models/capabilities endpoint only knew about
configs on the frontend's filesystem, not the registry-backed truth.
/api/models/capabilities now joins in ListAllLoadedModels() when the
registry is active, returning loaded_on[] with node id/name/state/status
for each model. Models that live in the registry but lack a local config
(the actual ghosts, not recovered from the frontend's file cache) still
surface with source="registry-only" so operators can see and persist
them; without that emission they'd be invisible to this frontend.
Manage → Models replaces the old Running/Idle pill with a distribution
cell that lists the first three nodes the model is loaded on as chips
colored by state (green loaded, blue loading, amber anything else). On
wider clusters the remaining count collapses into a +N chip with a
title-attribute breakdown. Disabled / single-node behavior unchanged.
Adopted models get an extra "Adopted" ghost-icon chip with hover copy
explaining what it means and how to make it permanent.
Distributed mode also enables a 10s auto-refresh and a "Last synced Xs
ago" indicator next to the Update button so ghost rows drop off within
one reconcile tick after their owning process dies. Non-distributed
mode is untouched — no polling, no cell-stack, same old Running/Idle.
* feat(ui): NodeDistributionChip — shared per-node attribution component
Large clusters were going to break the Manage → Backends Nodes column:
the old inline logic rendered every node as a badge and would shred the
layout at >10 workers, plus the Manage → Models distribution cell had
copy-pasted its own slightly-different version.
NodeDistributionChip handles any cluster size with two render modes:
- small (≤3 nodes): inline chips of node names, colored by health.
- large: a single "on N nodes · M offline · K drift" summary chip;
clicking opens a Popover with a per-node table (name, status,
version, digest for backends; name, status, state for models).
Drift counting mirrors the backend's summarizeNodeDrift so the UI
number matches UpgradeInfo.NodeDrift. Digests are truncated to the
docker-style 12-char form with the full value preserved in the title.
Popover is a new general-purpose primitive: fixed positioning anchored
to the trigger, flips above when there's no room below, closes on
outside-click or Escape, returns focus to the trigger. Uses .card as
its surface so theming is inherited. Also useful for a future
labels-editor popup and the user menu.
Manage.jsx drops its duplicated inline Nodes-column + loaded_on cell
and uses the shared chip with context="backends" / "models"
respectively. Deleting the duplicated code removes ~40 lines of ad-hoc logic.
* feat(ui): shared FilterBar across the System page tabs
The Backends gallery had a nice search + chip + toggle strip; the System
page had nothing, so the two surfaces felt like different apps. Lift the
pattern into a reusable FilterBar and wire both System tabs through it.
New component core/http/react-ui/src/components/FilterBar.jsx renders a
search input, a role="tablist" chip row (aria-selected for a11y), and
optional toggles / right slot. Chips support an optional `count` which
the System page uses to show "User 3", "Updates 1" etc.
System Models tab: search by id or backend; chips for
All/Running/Idle/Disabled/Pinned plus a conditional Distributed chip in
distributed mode. "Last synced" + Update button live in the right slot.
System Backends tab: search by name/alias/meta-backend-for; chips for
All/User/System/Meta plus conditional Updates / Offline-nodes chips
when relevant. The old ad-hoc "Updates only" toggle from the upgrade
banner folded into the Updates chip — one source of truth for that
filter. Offline chip only appears in distributed mode when at least
one backend has an unhealthy node, so the chip row stays quiet on
healthy clusters.
Filter state persists in URL query params (mq/mf/bq/bf) so deep links
and tab switches keep the operator's filter context instead of
resetting every time.
Also adds an "Adopted" distribution path: when a model in
/api/models/capabilities carries source="registry-only" (discovered on
a worker but not configured locally), the Models tab shows a ghost chip
labelled "Adopted" with hover copy explaining how to persist it — this
is what closes the loop on the ghost-model story end-to-end.
2026-04-19 15:55:53 +00:00
|
|
|
Registry: registry,
|
|
|
|
|
Scheduler: router,
|
|
|
|
|
Unloader: remoteUnloader,
|
|
|
|
|
Adapter: remoteUnloader,
|
|
|
|
|
RegistrationToken: cfg.Distributed.RegistrationToken,
|
|
|
|
|
DB: authDB,
|
|
|
|
|
Interval: 30 * time.Second,
|
|
|
|
|
ScaleDownDelay: 5 * time.Minute,
|
|
|
|
|
ProbeStaleAfter: 2 * time.Minute,
|
2026-03-31 06:28:56 +00:00
|
|
|
})
|
|
|
|
|
|
2026-03-29 22:47:27 +00:00
|
|
|
// Create ModelRouterAdapter to wire into ModelLoader
|
|
|
|
|
modelAdapter := nodes.NewModelRouterAdapter(router)
|
|
|
|
|
|
|
|
|
|
success = true
|
|
|
|
|
return &DistributedServices{
|
|
|
|
|
Nats: natsClient,
|
|
|
|
|
Store: store,
|
|
|
|
|
Registry: registry,
|
|
|
|
|
Router: router,
|
|
|
|
|
Health: healthMon,
|
2026-03-31 06:28:56 +00:00
|
|
|
Reconciler: reconciler,
|
2026-03-29 22:47:27 +00:00
|
|
|
JobStore: jobStore,
|
|
|
|
|
Dispatcher: dispatcher,
|
|
|
|
|
AgentStore: agentStore,
|
|
|
|
|
AgentBridge: agentBridge,
|
|
|
|
|
DistStores: distStores,
|
|
|
|
|
FileMgr: fileMgr,
|
|
|
|
|
FileStager: fileStager,
|
|
|
|
|
ModelAdapter: modelAdapter,
|
|
|
|
|
Unloader: remoteUnloader,
|
|
|
|
|
}, nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// isPostgresURL reports whether url starts with one of the PostgreSQL
// connection-URI schemes, "postgres://" or "postgresql://". The match is a
// case-sensitive prefix comparison; nothing after the scheme is inspected.
func isPostgresURL(url string) bool {
	for _, scheme := range [...]string{"postgres://", "postgresql://"} {
		if strings.HasPrefix(url, scheme) {
			return true
		}
	}
	return false
}
|