2023-04-11 22:02:39 +00:00
package model
2023-04-07 09:30:59 +00:00
import (
2023-07-14 23:19:43 +00:00
"context"
2023-04-07 09:30:59 +00:00
"fmt"
2025-06-15 12:56:52 +00:00
"maps"
2023-04-07 09:30:59 +00:00
"os"
"path/filepath"
2023-04-10 10:02:40 +00:00
"strings"
2023-04-07 09:30:59 +00:00
"sync"
2026-03-18 07:31:26 +00:00
"sync/atomic"
2024-08-30 13:20:39 +00:00
"time"
2023-04-07 09:30:59 +00:00
2026-03-29 22:47:27 +00:00
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
2025-08-14 17:38:26 +00:00
"github.com/mudler/LocalAI/pkg/system"
2024-06-23 08:24:36 +00:00
"github.com/mudler/LocalAI/pkg/utils"
2024-04-19 02:40:18 +00:00
2025-12-21 18:33:13 +00:00
"github.com/mudler/xlog"
2023-04-07 09:30:59 +00:00
)
2023-07-22 15:31:39 +00:00
// new idea: what if we declare a struct of these here, and use a loop to check?
2025-05-16 10:45:48 +00:00
// TODO: Split ModelLoader and TemplateLoader? Just to keep things more organized. Left together to share a mutex until I look into that. Would split if we separate directories for .bin/.yaml and .tmpl
2026-03-11 06:30:49 +00:00
// ModelUnloadHook is a callback invoked when a model is about to be unloaded.
// It receives the name of that model as its only argument.
type ModelUnloadHook func(modelName string)
2026-03-29 22:47:27 +00:00
// RemoteModelUnloader unloads models that live on remote backend nodes.
//
// In distributed mode the SmartRouter provides the implementation. When
// ShutdownModel is asked to stop a model that has no local process,
// UnloadRemoteModel is called so the remote node can free it.
type RemoteModelUnloader interface {
	// UnloadRemoteModel asks the remote node to unload modelName.
	UnloadRemoteModel(modelName string) error
}
// ModelRouter is the callback used to hand a model load over to a remote node
// rather than starting a local process. Once installed on the ModelLoader,
// grpcModel() delegates to it before attempting a local load.
type ModelRouter func(
	ctx context.Context,
	backend, modelID, modelName, modelFile string,
	opts *pb.ModelOptions,
	parallel bool,
) (*Model, error)
2023-04-07 09:30:59 +00:00
type ModelLoader struct {
2025-12-25 13:26:18 +00:00
ModelPath string
mu sync . Mutex
2026-03-29 22:47:27 +00:00
store ModelStore
2025-12-25 13:26:18 +00:00
loading map [ string ] chan struct { } // tracks models currently being loaded
wd * WatchDog
externalBackends map [ string ] string
lruEvictionMaxRetries int // Maximum number of retries when waiting for busy models
lruEvictionRetryInterval time . Duration // Interval between retries when waiting for busy models
2026-03-11 06:30:49 +00:00
onUnloadHooks [ ] ModelUnloadHook
2026-03-29 22:47:27 +00:00
remoteUnloader RemoteModelUnloader
modelRouter ModelRouter // distributed mode: route to remote node
2026-03-18 07:31:26 +00:00
backendLogs * BackendLogStore
backendLoggingEnabled atomic . Bool
2023-04-07 09:30:59 +00:00
}
2025-12-12 11:28:38 +00:00
// NewModelLoader creates a new ModelLoader instance.
// LRU eviction is now managed through the WatchDog component.
func NewModelLoader ( system * system . SystemState ) * ModelLoader {
2023-07-22 15:31:39 +00:00
nml := & ModelLoader {
2025-12-25 13:26:18 +00:00
ModelPath : system . Model . ModelsPath ,
2026-03-29 22:47:27 +00:00
store : NewInMemoryModelStore ( ) ,
2025-12-25 13:26:18 +00:00
loading : make ( map [ string ] chan struct { } ) ,
externalBackends : make ( map [ string ] string ) ,
lruEvictionMaxRetries : 30 , // Default: 30 retries
lruEvictionRetryInterval : 1 * time . Second , // Default: 1 second
2026-03-18 07:31:26 +00:00
backendLogs : NewBackendLogStore ( 1000 ) ,
2023-04-20 22:06:55 +00:00
}
2023-11-26 17:36:23 +00:00
2023-07-22 15:31:39 +00:00
return nml
2023-04-07 09:30:59 +00:00
}
2025-12-12 11:28:38 +00:00
// GetLoadingCount reports how many models have a load in progress right now,
// i.e. the number of entries in the loading map.
func (ml *ModelLoader) GetLoadingCount() int {
	ml.mu.Lock()
	inFlight := len(ml.loading)
	ml.mu.Unlock()

	return inFlight
}
2026-03-11 06:30:49 +00:00
// OnModelUnload appends hook to the list of callbacks that are called when a
// model is unloaded. The list is updated while holding ml.mu.
func (ml *ModelLoader) OnModelUnload(hook ModelUnloadHook) {
	ml.mu.Lock()
	ml.onUnloadHooks = append(ml.onUnloadHooks, hook)
	ml.mu.Unlock()
}
2023-11-26 17:36:23 +00:00
func ( ml * ModelLoader ) SetWatchDog ( wd * WatchDog ) {
2026-03-29 22:47:27 +00:00
ml . mu . Lock ( )
defer ml . mu . Unlock ( )
2023-11-26 17:36:23 +00:00
ml . wd = wd
}
2026-03-29 22:47:27 +00:00
// SetRemoteUnloader installs the handler used to unload models on remote
// nodes. In distributed mode, this should be set to the SmartRouter adapter.
func (ml *ModelLoader) SetRemoteUnloader(u RemoteModelUnloader) {
	ml.mu.Lock()
	ml.remoteUnloader = u
	ml.mu.Unlock()
}
// SetModelRouter installs the distributed model router callback. Once it is
// set, grpcModel() delegates to it before attempting a local load.
func (ml *ModelLoader) SetModelRouter(r ModelRouter) {
	ml.mu.Lock()
	ml.modelRouter = r
	ml.mu.Unlock()
}
// SetModelStore swaps the default in-memory model store for s. In distributed
// mode it is called with a DistributedModelStore.
func (ml *ModelLoader) SetModelStore(s ModelStore) {
	ml.mu.Lock()
	ml.store = s
	ml.mu.Unlock()
}
2025-11-20 21:37:20 +00:00
// GetWatchDog returns the watchdog currently attached to the loader, or nil
// when none has been set.
//
// NOTE(review): the field is read without taking ml.mu, whereas SetWatchDog
// writes it under the lock - confirm whether callers may already hold ml.mu
// before adding locking here.
func (ml *ModelLoader) GetWatchDog() *WatchDog {
	wd := ml.wd
	return wd
}
2026-03-18 07:31:26 +00:00
// BackendLogs returns the loader's backend log store.
func (ml *ModelLoader) BackendLogs() *BackendLogStore {
	logs := ml.backendLogs
	return logs
}
// SetBackendLoggingEnabled atomically stores the backend-logging flag.
func (ml *ModelLoader) SetBackendLoggingEnabled(enabled bool) {
	flag := &ml.backendLoggingEnabled
	flag.Store(enabled)
}
// BackendLoggingEnabled atomically reads the backend-logging flag.
func (ml *ModelLoader) BackendLoggingEnabled() bool {
	enabled := ml.backendLoggingEnabled.Load()
	return enabled
}
2025-12-25 13:26:18 +00:00
// SetLRUEvictionRetrySettings replaces both LRU eviction retry settings at
// once: maxRetries is the maximum number of retries when waiting for busy
// models and retryInterval is the pause between those retries.
func (ml *ModelLoader) SetLRUEvictionRetrySettings(maxRetries int, retryInterval time.Duration) {
	ml.mu.Lock()
	ml.lruEvictionMaxRetries, ml.lruEvictionRetryInterval = maxRetries, retryInterval
	ml.mu.Unlock()
}
2023-04-20 16:33:02 +00:00
func ( ml * ModelLoader ) ExistsInModelPath ( s string ) bool {
2024-04-19 02:40:18 +00:00
return utils . ExistsInPath ( ml . ModelPath , s )
2023-04-20 16:33:02 +00:00
}
2025-06-15 12:56:52 +00:00
// SetExternalBackend records uri as the location of the external backend
// called name, overwriting any previous entry for that name.
func (ml *ModelLoader) SetExternalBackend(name, uri string) {
	ml.mu.Lock()
	// Deferred so the lock is released even if the map write panics.
	defer ml.mu.Unlock()

	ml.externalBackends[name] = uri
}
// DeleteExternalBackend removes the external backend registered under name.
// It is a no-op when no such entry exists.
func (ml *ModelLoader) DeleteExternalBackend(name string) {
	ml.mu.Lock()
	delete(ml.externalBackends, name)
	ml.mu.Unlock()
}
// GetExternalBackend returns the URI registered for the external backend
// called name, or the empty string when there is no such entry.
func (ml *ModelLoader) GetExternalBackend(name string) string {
	ml.mu.Lock()
	uri := ml.externalBackends[name]
	ml.mu.Unlock()

	return uri
}
// GetAllExternalBackends returns a fresh map holding every external backend
// known to the loader, overlaid with the external backends carried by o (when
// o is non-nil). Entries from o win over loader entries with the same name.
// The returned map is a copy; mutating it does not affect the loader.
//
// The loader's map is copied while holding ml.mu, because SetExternalBackend
// and DeleteExternalBackend mutate it under that lock; an unguarded iteration
// would be a data race. Callers must therefore not hold ml.mu when calling
// this method.
func (ml *ModelLoader) GetAllExternalBackends(o *Options) map[string]string {
	ml.mu.Lock()
	backends := make(map[string]string, len(ml.externalBackends))
	maps.Copy(backends, ml.externalBackends)
	ml.mu.Unlock()

	// o belongs to the caller, so it is read outside the loader lock.
	if o != nil {
		maps.Copy(backends, o.externalBackends)
	}
	return backends
}
2024-07-10 13:28:39 +00:00
// knownFilesToSkip lists file names that ListFilesInModelPath never reports
// as models. Names are compared case-insensitively against the whole file
// name.
var knownFilesToSkip = []string{
	"MODEL_CARD",
	"README",
	"README.md",
}
// knownModelsNameSuffixToSkip lists file-name suffixes that
// ListFilesInModelPath never reports as models. Matching is a case-sensitive
// suffix test, which is why both ".md" and ".MD" appear.
var knownModelsNameSuffixToSkip = []string{
	".tmpl",
	".keep",
	".yaml",
	".yml",
	".json",
	".txt",
	".pt",
	".onnx",
	".md",
	".MD",
	".DS_Store",
	".",
	".safetensors",
	".bin",
	".gguf",
	".ggml",
	".ckpt",
	".zip",
	".tag",
	".partial",
	".tar.gz",
}
2024-09-17 04:50:57 +00:00
// retryTimeout is a two-minute duration.
// NOTE(review): it is not referenced in this part of the file - confirm its
// users before changing the value.
const retryTimeout = 2 * time.Minute
2024-07-10 13:28:39 +00:00
func ( ml * ModelLoader ) ListFilesInModelPath ( ) ( [ ] string , error ) {
2023-07-22 15:31:39 +00:00
files , err := os . ReadDir ( ml . ModelPath )
2023-04-10 10:02:40 +00:00
if err != nil {
return [ ] string { } , err
}
models := [ ] string { }
2024-07-10 13:28:39 +00:00
FILE :
2023-04-10 10:02:40 +00:00
for _ , file := range files {
2024-07-10 13:28:39 +00:00
for _ , skip := range knownFilesToSkip {
if strings . EqualFold ( file . Name ( ) , skip ) {
continue FILE
}
}
// Skip templates, YAML, .keep, .json, and .DS_Store files
for _ , skip := range knownModelsNameSuffixToSkip {
if strings . HasSuffix ( file . Name ( ) , skip ) {
continue FILE
}
}
// Skip directories
if file . IsDir ( ) {
2023-04-20 16:33:02 +00:00
continue
2023-04-10 10:02:40 +00:00
}
2023-04-20 16:33:02 +00:00
models = append ( models , file . Name ( ) )
2023-04-10 10:02:40 +00:00
}
return models , nil
}
2025-08-16 05:44:50 +00:00
func ( ml * ModelLoader ) ListLoadedModels ( ) [ ] * Model {
2024-08-30 13:20:39 +00:00
ml . mu . Lock ( )
defer ml . mu . Unlock ( )
2024-11-14 13:12:29 +00:00
models := [ ] * Model { }
2026-03-29 22:47:27 +00:00
ml . store . Range ( func ( _ string , m * Model ) bool {
models = append ( models , m )
return true
} )
2024-08-30 13:20:39 +00:00
return models
}
2024-10-02 06:55:58 +00:00
func ( ml * ModelLoader ) LoadModel ( modelID , modelName string , loader func ( string , string , string ) ( * Model , error ) ) ( * Model , error ) {
2025-10-16 19:28:19 +00:00
ml . mu . Lock ( )
2023-05-10 23:12:58 +00:00
// Check if we already have a loaded model
2025-10-16 19:28:19 +00:00
if model := ml . checkIsLoaded ( modelID ) ; model != nil {
2025-12-12 11:28:38 +00:00
ml . mu . Unlock ( )
2023-07-14 23:19:43 +00:00
return model , nil
2023-04-19 15:10:29 +00:00
}
2023-04-20 16:33:02 +00:00
2025-12-12 11:28:38 +00:00
// Check if another goroutine is already loading this model
if loadingChan , isLoading := ml . loading [ modelID ] ; isLoading {
ml . mu . Unlock ( )
// Wait for the other goroutine to finish loading
2025-12-21 18:33:13 +00:00
xlog . Debug ( "Waiting for model to be loaded by another request" , "modelID" , modelID )
2025-12-12 11:28:38 +00:00
<- loadingChan
// Now check if the model is loaded
ml . mu . Lock ( )
model := ml . checkIsLoaded ( modelID )
ml . mu . Unlock ( )
if model != nil {
return model , nil
}
// If still not loaded, the other goroutine failed - we'll try again
return ml . LoadModel ( modelID , modelName , loader )
}
// Mark this model as loading (create a channel that will be closed when done)
loadingChan := make ( chan struct { } )
ml . loading [ modelID ] = loadingChan
ml . mu . Unlock ( )
// Ensure we clean up the loading state when done
defer func ( ) {
ml . mu . Lock ( )
delete ( ml . loading , modelID )
close ( loadingChan )
ml . mu . Unlock ( )
} ( )
// Load the model (this can take a long time, no lock held)
2023-04-27 04:18:18 +00:00
modelFile := filepath . Join ( ml . ModelPath , modelName )
2025-12-21 18:33:13 +00:00
xlog . Debug ( "Loading model in memory from file" , "file" , modelFile )
2023-04-20 16:33:02 +00:00
2024-10-02 06:55:58 +00:00
model , err := loader ( modelID , modelName , modelFile )
2023-04-19 15:10:29 +00:00
if err != nil {
2024-10-02 18:37:40 +00:00
return nil , fmt . Errorf ( "failed to load model with internal loader: %s" , err )
2023-04-19 15:10:29 +00:00
}
2024-08-30 13:20:39 +00:00
if model == nil {
return nil , fmt . Errorf ( "loader didn't return a model" )
}
2025-12-12 11:28:38 +00:00
// Add to models map
ml . mu . Lock ( )
2026-03-29 22:47:27 +00:00
ml . store . Set ( modelID , model )
2025-12-12 11:28:38 +00:00
ml . mu . Unlock ( )
2024-08-30 13:20:39 +00:00
2023-05-11 14:34:16 +00:00
return model , nil
2023-05-05 09:20:06 +00:00
}
2023-07-14 23:19:43 +00:00
2023-08-23 16:38:37 +00:00
func ( ml * ModelLoader ) ShutdownModel ( modelName string ) error {
ml . mu . Lock ( )
defer ml . mu . Unlock ( )
2024-09-17 04:50:57 +00:00
return ml . deleteProcess ( modelName )
2023-08-23 16:38:37 +00:00
}
2024-08-25 12:36:09 +00:00
func ( ml * ModelLoader ) CheckIsLoaded ( s string ) * Model {
2024-09-17 14:51:40 +00:00
ml . mu . Lock ( )
defer ml . mu . Unlock ( )
2025-10-16 19:28:19 +00:00
return ml . checkIsLoaded ( s )
}
func ( ml * ModelLoader ) checkIsLoaded ( s string ) * Model {
2026-03-29 22:47:27 +00:00
m , ok := ml . store . Get ( s )
2024-08-25 12:36:09 +00:00
if ! ok {
return nil
2023-07-14 23:19:43 +00:00
}
2025-12-21 18:33:13 +00:00
xlog . Debug ( "Model already loaded in memory" , "model" , s )
2026-03-29 22:47:27 +00:00
// Skip the gRPC health check if the model was recently verified.
// This avoids serializing concurrent requests behind ml.mu while each
// one does a network round-trip (especially costly in distributed mode).
if m . IsRecentlyHealthy ( ) {
xlog . Debug ( "Model health check cached, skipping gRPC probe" , "model" , s )
return m
}
2024-08-30 13:20:39 +00:00
client := m . GRPC ( false , ml . wd )
2025-12-21 18:33:13 +00:00
xlog . Debug ( "Checking model availability" , "model" , s )
2024-08-30 13:20:39 +00:00
cTimeout , cancel := context . WithTimeout ( context . Background ( ) , 2 * time . Minute )
defer cancel ( )
alive , err := client . HealthCheck ( cTimeout )
2024-08-25 12:36:09 +00:00
if ! alive {
2025-12-21 18:33:13 +00:00
xlog . Warn ( "GRPC Model not responding" , "error" , err )
xlog . Warn ( "Deleting the process in order to recreate it" )
2024-09-26 10:44:55 +00:00
process := m . Process ( )
if process == nil {
2026-04-08 10:11:02 +00:00
// Remote/distributed model — no local process to check.
// Only evict on definitive connection errors (node is down).
// Timeouts may mean the node is busy, so keep the model cached.
if isConnectionError ( err ) {
xlog . Warn ( "Remote model unreachable (connection error), removing from cache" , "model" , s , "error" , err )
if delErr := ml . deleteProcess ( s ) ; delErr != nil {
xlog . Error ( "error cleaning up remote model" , "error" , delErr , "model" , s )
}
return nil
}
xlog . Warn ( "Remote model health check failed (possible timeout), keeping cached" , "model" , s , "error" , err )
2024-08-30 13:20:39 +00:00
return m
}
if ! process . IsAlive ( ) {
2025-12-21 18:33:13 +00:00
xlog . Debug ( "GRPC Process is not responding" , "model" , s )
2024-08-25 12:36:09 +00:00
// stop and delete the process, this forces to re-load the model and re-create again the service
err := ml . deleteProcess ( s )
if err != nil {
2025-12-21 18:33:13 +00:00
xlog . Error ( "error stopping process" , "error" , err , "process" , s )
2024-08-25 12:36:09 +00:00
}
return nil
}
2023-07-22 15:31:39 +00:00
}
2026-03-29 22:47:27 +00:00
m . MarkHealthy ( )
2024-08-25 12:36:09 +00:00
return m
2023-07-22 15:31:39 +00:00
}