fix(nats): improve error handling (#9222)

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto 2026-04-04 12:11:54 +02:00 committed by GitHub
parent 9f8821bba8
commit 223deb908d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 37 additions and 4 deletions

View file

@ -692,13 +692,13 @@ func (s *backendSupervisor) subscribeLifecycleEvents() {
// backend.delete — stop backend + delete files (request-reply)
s.nats.SubscribeReply(messaging.SubjectNodeBackendDelete(s.nodeID), func(data []byte, reply func([]byte)) {
xlog.Info("Received NATS backend.delete event")
var req messaging.BackendDeleteRequest
if err := json.Unmarshal(data, &req); err != nil {
resp := messaging.BackendDeleteReply{Success: false, Error: fmt.Sprintf("invalid request: %v", err)}
replyJSON(reply, resp)
return
}
xlog.Info("Received NATS backend.delete event", "backend", req.Backend)
// Stop if running this backend
if s.isRunning(req.Backend) {

View file

@ -300,14 +300,29 @@ func DeleteBackendFromSystem(systemState *system.SystemState, name string) error
backend, ok := backends.Get(name)
if !ok {
return fmt.Errorf("backend %q: %w", name, ErrBackendNotFound)
// Not found by direct key — try matching by gallery name (metadata.Name).
// The UI may send gallery-style names like "localai@llama-cpp", which
// don't match the directory-based keys used in the backends map.
for _, b := range backends {
if b.Metadata != nil && b.Metadata.Name == name && !b.IsMeta {
backend = b
ok = true
break
}
}
if !ok {
return fmt.Errorf("backend %q: %w", name, ErrBackendNotFound)
}
}
if backend.IsSystem {
return fmt.Errorf("system backend %q cannot be deleted", name)
}
backendDirectory := filepath.Join(systemState.Backend.BackendsPath, name)
// Use the backend's actual Name (directory key) for path resolution,
// not the caller-supplied name, which may be a gallery-style name.
dirName := backend.Name
backendDirectory := filepath.Join(systemState.Backend.BackendsPath, dirName)
// check if the backend dir exists
if _, err := os.Stat(backendDirectory); os.IsNotExist(err) {
@ -325,7 +340,7 @@ func DeleteBackendFromSystem(systemState *system.SystemState, name string) error
if err != nil {
return err
}
if metadata != nil && metadata.Alias == name {
if metadata != nil && (metadata.Alias == name || metadata.Alias == dirName) {
backendDirectory = filepath.Join(systemState.Backend.BackendsPath, backend.Name())
foundBackend = true
break

View file

@ -11,6 +11,7 @@ import (
"github.com/mudler/LocalAI/core/services/galleryop"
"github.com/mudler/LocalAI/pkg/model"
"github.com/mudler/xlog"
"github.com/nats-io/nats.go"
)
// DistributedModelManager wraps a local ModelManager and adds NATS fan-out
@ -84,6 +85,13 @@ func (d *DistributedBackendManager) DeleteBackend(name string) error {
continue
}
if _, delErr := d.adapter.DeleteBackend(node.ID, name); delErr != nil {
if errors.Is(delErr, nats.ErrNoResponders) {
// Node's NATS subscription is gone — likely restarted with a new ID.
// Mark it unhealthy so future fan-outs skip it.
xlog.Warn("No NATS responders for node, marking unhealthy", "node", node.Name, "nodeID", node.ID)
d.registry.MarkUnhealthy(context.Background(), node.ID)
continue
}
xlog.Warn("Failed to propagate backend deletion to worker", "node", node.Name, "backend", name, "error", delErr)
errs = append(errs, fmt.Errorf("node %s: %w", node.Name, delErr))
}
@ -105,6 +113,11 @@ func (d *DistributedBackendManager) ListBackends() (gallery.SystemBackends, erro
}
reply, err := d.adapter.ListBackends(node.ID)
if err != nil {
if errors.Is(err, nats.ErrNoResponders) {
xlog.Warn("No NATS responders for node, marking unhealthy", "node", node.Name, "nodeID", node.ID)
d.registry.MarkUnhealthy(context.Background(), node.ID)
continue
}
xlog.Warn("Failed to list backends on worker", "node", node.Name, "error", err)
continue
}
@ -145,6 +158,11 @@ func (d *DistributedBackendManager) InstallBackend(ctx context.Context, op *gall
}
reply, err := d.adapter.InstallBackend(node.ID, backendName, "", string(galleriesJSON))
if err != nil {
if errors.Is(err, nats.ErrNoResponders) {
xlog.Warn("No NATS responders for node, marking unhealthy", "node", node.Name, "nodeID", node.ID)
d.registry.MarkUnhealthy(context.Background(), node.ID)
continue
}
xlog.Warn("Failed to install backend on worker", "node", node.Name, "backend", backendName, "error", err)
continue
}