feat(stable-diffusion.ggml): add support for video generation (#9420)

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto 2026-04-19 09:26:33 +02:00 committed by GitHub
parent 6e49dba27c
commit 054c4b4b45
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 439 additions and 25 deletions

View file

@ -26,6 +26,10 @@
#include "stb_image_resize.h"
#include <stdlib.h>
#include <regex>
#include <errno.h>
#include <signal.h>
#include <unistd.h>
#include <sys/wait.h>
@ -980,6 +984,251 @@ int gen_image(sd_img_gen_params_t *p, int steps, char *dst, float cfg_scale, cha
return !ret;
}
// ---------------- Video generation ----------------
sd_vid_gen_params_t* sd_vid_gen_params_new(void) {
sd_vid_gen_params_t *params = (sd_vid_gen_params_t *)std::malloc(sizeof(sd_vid_gen_params_t));
sd_vid_gen_params_init(params);
sd_sample_params_init(&params->sample_params);
sd_sample_params_init(&params->high_noise_sample_params);
sd_cache_params_init(&params->cache);
return params;
}
// Persistent storage for cleaned video prompts (kept alive for the duration of generation)
static std::string cleaned_vid_prompt_storage;
static std::string cleaned_vid_negative_prompt_storage;
void sd_vid_gen_params_set_prompts(sd_vid_gen_params_t *params, const char *prompt, const char *negative_prompt) {
lora_vec.clear();
lora_strings.clear();
std::string prompt_str = prompt ? prompt : "";
std::string negative_prompt_str = negative_prompt ? negative_prompt : "";
const char* lora_dir_to_use = lora_dir_path.empty() ? nullptr : lora_dir_path.c_str();
auto [loras, cleaned_prompt] = parse_loras_from_prompt(prompt_str, lora_dir_to_use);
lora_vec = loras;
cleaned_vid_prompt_storage = cleaned_prompt;
auto [neg_loras, cleaned_negative] = parse_loras_from_prompt(negative_prompt_str, lora_dir_to_use);
cleaned_vid_negative_prompt_storage = cleaned_negative;
params->prompt = cleaned_vid_prompt_storage.c_str();
params->negative_prompt = cleaned_vid_negative_prompt_storage.c_str();
params->loras = lora_vec.empty() ? nullptr : lora_vec.data();
params->lora_count = static_cast<uint32_t>(lora_vec.size());
}
void sd_vid_gen_params_set_dimensions(sd_vid_gen_params_t *params, int width, int height) {
params->width = width;
params->height = height;
}
void sd_vid_gen_params_set_seed(sd_vid_gen_params_t *params, int64_t seed) {
params->seed = seed;
}
void sd_vid_gen_params_set_video_frames(sd_vid_gen_params_t *params, int n) {
params->video_frames = n;
}
// Load an image file into an sd_image_t, resizing to target dims if needed.
// Returns a heap-allocated buffer the caller must free (or nullptr on failure).
static uint8_t* load_and_resize_image(const char* path, int target_width, int target_height, sd_image_t* out) {
if (!path || strlen(path) == 0) {
*out = {0, 0, 0, nullptr};
return nullptr;
}
int c = 0, img_w = 0, img_h = 0;
uint8_t* buf = stbi_load(path, &img_w, &img_h, &c, 3);
if (!buf) {
fprintf(stderr, "Failed to load image from '%s'\n", path);
*out = {0, 0, 0, nullptr};
return nullptr;
}
if (img_w != target_width || img_h != target_height) {
fprintf(stderr, "Resizing image from %dx%d to %dx%d\n", img_w, img_h, target_width, target_height);
uint8_t* resized = (uint8_t*)malloc((size_t)target_width * target_height * 3);
if (!resized) { free(buf); *out = {0, 0, 0, nullptr}; return nullptr; }
stbir_resize(buf, img_w, img_h, 0,
resized, target_width, target_height, 0, STBIR_TYPE_UINT8,
3, STBIR_ALPHA_CHANNEL_NONE, 0,
STBIR_EDGE_CLAMP, STBIR_EDGE_CLAMP,
STBIR_FILTER_BOX, STBIR_FILTER_BOX,
STBIR_COLORSPACE_SRGB, nullptr);
free(buf);
buf = resized;
}
*out = {(uint32_t)target_width, (uint32_t)target_height, 3, buf};
return buf;
}
// Pipe raw RGB/RGBA frames to ffmpeg stdin and let it produce an MP4 at dst.
// Uses fork+execvp to avoid shell interpretation of dst.
static int ffmpeg_mux_raw_to_mp4(sd_image_t* frames, int num_frames, int fps, const char* dst) {
if (num_frames <= 0 || !frames || !frames[0].data) {
fprintf(stderr, "ffmpeg_mux: empty frames\n");
return 1;
}
int width = (int)frames[0].width;
int height = (int)frames[0].height;
int channels = (int)frames[0].channel;
const char* pix_fmt_in = (channels == 4) ? "rgba" : "rgb24";
char size_str[32];
char fps_str[32];
snprintf(size_str, sizeof(size_str), "%dx%d", width, height);
snprintf(fps_str, sizeof(fps_str), "%d", fps);
int pipefd[2];
if (pipe(pipefd) != 0) { perror("pipe"); return 1; }
pid_t pid = fork();
if (pid < 0) { perror("fork"); close(pipefd[0]); close(pipefd[1]); return 1; }
if (pid == 0) {
// child
close(pipefd[1]);
if (dup2(pipefd[0], STDIN_FILENO) < 0) { perror("dup2"); _exit(127); }
close(pipefd[0]);
std::vector<char*> argv = {
const_cast<char*>("ffmpeg"),
const_cast<char*>("-y"),
const_cast<char*>("-hide_banner"),
const_cast<char*>("-loglevel"), const_cast<char*>("warning"),
const_cast<char*>("-f"), const_cast<char*>("rawvideo"),
const_cast<char*>("-pix_fmt"), const_cast<char*>(pix_fmt_in),
const_cast<char*>("-s"), size_str,
const_cast<char*>("-framerate"), fps_str,
const_cast<char*>("-i"), const_cast<char*>("-"),
const_cast<char*>("-c:v"), const_cast<char*>("libx264"),
const_cast<char*>("-pix_fmt"), const_cast<char*>("yuv420p"),
const_cast<char*>("-movflags"), const_cast<char*>("+faststart"),
const_cast<char*>(dst),
nullptr
};
execvp(argv[0], argv.data());
perror("execvp ffmpeg");
_exit(127);
}
// parent
close(pipefd[0]);
// Ignore SIGPIPE so a dying ffmpeg surfaces via write() errno instead of killing us.
signal(SIGPIPE, SIG_IGN);
for (int i = 0; i < num_frames; i++) {
if (!frames[i].data) continue;
size_t frame_bytes = (size_t)frames[i].width * frames[i].height * frames[i].channel;
const uint8_t* p = frames[i].data;
size_t remaining = frame_bytes;
while (remaining > 0) {
ssize_t n = write(pipefd[1], p, remaining);
if (n < 0) {
if (errno == EINTR) continue;
perror("write frame to ffmpeg");
close(pipefd[1]);
int status;
waitpid(pid, &status, 0);
return 1;
}
p += n;
remaining -= (size_t)n;
}
}
close(pipefd[1]);
int status = 0;
while (waitpid(pid, &status, 0) < 0) {
if (errno != EINTR) { perror("waitpid"); return 1; }
}
if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
fprintf(stderr, "ffmpeg exited with status %d\n", status);
return 1;
}
return 0;
}
int gen_video(sd_vid_gen_params_t *p, int steps, char *dst, float cfg_scale, int fps, char *init_image, char *end_image) {
if (!p) return 1;
if (!dst || strlen(dst) == 0) {
fprintf(stderr, "gen_video: dst is empty\n");
std::free(p);
return 1;
}
std::vector<int> skip_layers = {7, 8, 9};
fprintf(stderr, "Generating video: %dx%d, frames=%d, fps=%d, steps=%d, cfg=%.2f\n",
p->width, p->height, p->video_frames, fps, steps, cfg_scale);
// Sample params (shared by both low and high-noise passes — MoE models use the high-noise
// set during the first phase; single-model Wan2.1 ignores it. Same defaults for both is fine.)
p->sample_params.guidance.txt_cfg = cfg_scale;
p->sample_params.guidance.slg.layers = skip_layers.data();
p->sample_params.guidance.slg.layer_count = skip_layers.size();
p->sample_params.sample_method = sample_method;
p->sample_params.sample_steps = steps;
p->sample_params.scheduler = scheduler;
p->sample_params.flow_shift = flow_shift;
p->high_noise_sample_params.guidance.txt_cfg = cfg_scale;
p->high_noise_sample_params.guidance.slg.layers = skip_layers.data();
p->high_noise_sample_params.guidance.slg.layer_count = skip_layers.size();
p->high_noise_sample_params.sample_method = sample_method;
p->high_noise_sample_params.sample_steps = steps;
p->high_noise_sample_params.scheduler = scheduler;
p->high_noise_sample_params.flow_shift = flow_shift;
// Load init/end reference images if provided (resized to output dims).
uint8_t* init_buf = nullptr;
uint8_t* end_buf = nullptr;
sd_image_t init_img = {0, 0, 0, nullptr};
sd_image_t end_img = {0, 0, 0, nullptr};
if (init_image && strlen(init_image) > 0) {
init_buf = load_and_resize_image(init_image, p->width, p->height, &init_img);
if (!init_buf) { std::free(p); return 1; }
}
if (end_image && strlen(end_image) > 0) {
end_buf = load_and_resize_image(end_image, p->width, p->height, &end_img);
if (!end_buf) { if (init_buf) free(init_buf); std::free(p); return 1; }
}
p->init_image = init_img;
p->end_image = end_img;
// Generate
int num_frames_out = 0;
sd_image_t* frames = generate_video(sd_c, p, &num_frames_out);
std::free(p);
if (!frames || num_frames_out == 0) {
fprintf(stderr, "generate_video produced no frames\n");
if (init_buf) free(init_buf);
if (end_buf) free(end_buf);
return 1;
}
fprintf(stderr, "Generated %d frames, muxing to %s via ffmpeg\n", num_frames_out, dst);
int rc = ffmpeg_mux_raw_to_mp4(frames, num_frames_out, fps, dst);
for (int i = 0; i < num_frames_out; i++) {
if (frames[i].data) free(frames[i].data);
}
free(frames);
if (init_buf) free(init_buf);
if (end_buf) free(end_buf);
if (rc == 0) {
fprintf(stderr, "gen_video done: %s\n", dst);
}
fflush(stderr);
return rc;
}
int unload() {
free_sd_ctx(sd_c);
return 0;

View file

@ -23,6 +23,7 @@ type SDGGML struct {
var (
LoadModel func(model, model_apth string, options []uintptr, threads int32, diff int) int
GenImage func(params uintptr, steps int, dst string, cfgScale float32, srcImage string, strength float32, maskImage string, refImages []uintptr, refImagesCount int) int
GenVideo func(params uintptr, steps int, dst string, cfgScale float32, fps int, initImage string, endImage string) int
TilingParamsSetEnabled func(params uintptr, enabled bool)
TilingParamsSetTileSizes func(params uintptr, tileSizeX int, tileSizeY int)
@ -34,6 +35,12 @@ var (
ImgGenParamsSetDimensions func(params uintptr, width int, height int)
ImgGenParamsSetSeed func(params uintptr, seed int64)
ImgGenParamsGetVaeTilingParams func(params uintptr) uintptr
VidGenParamsNew func() uintptr
VidGenParamsSetPrompts func(params uintptr, prompt string, negativePrompt string)
VidGenParamsSetDimensions func(params uintptr, width int, height int)
VidGenParamsSetSeed func(params uintptr, seed int64)
VidGenParamsSetVideoFrames func(params uintptr, n int)
)
// Copied from Purego internal/strings
@ -153,3 +160,58 @@ func (sd *SDGGML) GenerateImage(opts *pb.GenerateImageRequest) error {
return nil
}
func (sd *SDGGML) GenerateVideo(opts *pb.GenerateVideoRequest) error {
dst := opts.Dst
if dst == "" {
return fmt.Errorf("dst is empty")
}
width := int(opts.Width)
height := int(opts.Height)
if width == 0 {
width = 512
}
if height == 0 {
height = 512
}
numFrames := int(opts.NumFrames)
if numFrames <= 0 {
numFrames = 16
}
fps := int(opts.Fps)
if fps <= 0 {
fps = 16
}
steps := int(opts.Step)
if steps <= 0 {
steps = 20
}
cfg := opts.CfgScale
if cfg == 0 {
cfg = sd.cfgScale
}
if cfg == 0 {
cfg = 5.0
}
// sd_vid_gen_params_new allocates; gen_video frees it after the generation call.
p := VidGenParamsNew()
VidGenParamsSetPrompts(p, opts.Prompt, opts.NegativePrompt)
VidGenParamsSetDimensions(p, width, height)
VidGenParamsSetSeed(p, int64(opts.Seed))
VidGenParamsSetVideoFrames(p, numFrames)
fmt.Fprintf(os.Stderr, "GenerateVideo: dst=%s size=%dx%d frames=%d fps=%d steps=%d cfg=%.2f\n",
dst, width, height, numFrames, fps, steps, cfg)
ret := GenVideo(p, steps, dst, cfg, fps, opts.StartImage, opts.EndImage)
if ret != 0 {
return fmt.Errorf("video inference failed (code %d)", ret)
}
return nil
}

View file

@ -18,6 +18,13 @@ void sd_img_gen_params_set_seed(sd_img_gen_params_t *params, int64_t seed);
int load_model(const char *model, char *model_path, char* options[], int threads, int diffusionModel);
int gen_image(sd_img_gen_params_t *p, int steps, char *dst, float cfg_scale, char *src_image, float strength, char *mask_image, char* ref_images[], int ref_images_count);
sd_vid_gen_params_t* sd_vid_gen_params_new(void);
void sd_vid_gen_params_set_prompts(sd_vid_gen_params_t *params, const char *prompt, const char *negative_prompt);
void sd_vid_gen_params_set_dimensions(sd_vid_gen_params_t *params, int width, int height);
void sd_vid_gen_params_set_seed(sd_vid_gen_params_t *params, int64_t seed);
void sd_vid_gen_params_set_video_frames(sd_vid_gen_params_t *params, int n);
int gen_video(sd_vid_gen_params_t *p, int steps, char *dst, float cfg_scale, int fps, char *init_image, char *end_image);
#ifdef __cplusplus
}
#endif

View file

@ -32,6 +32,7 @@ func main() {
libFuncs := []LibFuncs{
{&LoadModel, "load_model"},
{&GenImage, "gen_image"},
{&GenVideo, "gen_video"},
{&TilingParamsSetEnabled, "sd_tiling_params_set_enabled"},
{&TilingParamsSetTileSizes, "sd_tiling_params_set_tile_sizes"},
{&TilingParamsSetRelSizes, "sd_tiling_params_set_rel_sizes"},
@ -42,6 +43,12 @@ func main() {
{&ImgGenParamsSetDimensions, "sd_img_gen_params_set_dimensions"},
{&ImgGenParamsSetSeed, "sd_img_gen_params_set_seed"},
{&ImgGenParamsGetVaeTilingParams, "sd_img_gen_params_get_vae_tiling_params"},
{&VidGenParamsNew, "sd_vid_gen_params_new"},
{&VidGenParamsSetPrompts, "sd_vid_gen_params_set_prompts"},
{&VidGenParamsSetDimensions, "sd_vid_gen_params_set_dimensions"},
{&VidGenParamsSetSeed, "sd_vid_gen_params_set_seed"},
{&VidGenParamsSetVideoFrames, "sd_vid_gen_params_set_video_frames"},
}
for _, lf := range libFuncs {

View file

@ -80,51 +80,65 @@ func VideoEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfi
return echo.ErrBadRequest
}
src := ""
if input.StartImage != "" {
// Stage a base64- or URL-provided image into a temp file so the
// backend can read it as a path. Used for both start_image and
// (optional) end_image. Returns the temp file path, or "" if the
// input is empty. Caller is responsible for the defer-cleanup.
stageImage := func(ref string) (string, error) {
if ref == "" {
return "", nil
}
var fileData []byte
var err error
// check if input.File is an URL, if so download it and save it
// to a temporary file
if strings.HasPrefix(input.StartImage, "http://") || strings.HasPrefix(input.StartImage, "https://") {
out, err := downloadFile(input.StartImage)
if err != nil {
return fmt.Errorf("failed downloading file:%w", err)
if strings.HasPrefix(ref, "http://") || strings.HasPrefix(ref, "https://") {
out, derr := downloadFile(ref)
if derr != nil {
return "", fmt.Errorf("failed downloading file: %w", derr)
}
defer os.RemoveAll(out)
fileData, err = os.ReadFile(out)
if err != nil {
return fmt.Errorf("failed reading file:%w", err)
return "", fmt.Errorf("failed reading file: %w", err)
}
} else {
// base 64 decode the file and write it somewhere
// that we will cleanup
fileData, err = base64.StdEncoding.DecodeString(input.StartImage)
fileData, err = base64.StdEncoding.DecodeString(ref)
if err != nil {
return err
return "", err
}
}
// Create a temporary file
outputFile, err := os.CreateTemp(appConfig.GeneratedContentDir, "b64")
if err != nil {
return err
return "", err
}
// write the base64 result
writer := bufio.NewWriter(outputFile)
_, err = writer.Write(fileData)
if err != nil {
if _, err := writer.Write(fileData); err != nil {
outputFile.Close()
return err
return "", err
}
if err := writer.Flush(); err != nil {
outputFile.Close()
return "", err
}
outputFile.Close()
src = outputFile.Name()
return outputFile.Name(), nil
}
src, err := stageImage(input.StartImage)
if err != nil {
return err
}
if src != "" {
defer os.RemoveAll(src)
}
endSrc, err := stageImage(input.EndImage)
if err != nil {
return err
}
if endSrc != "" {
defer os.RemoveAll(endSrc)
}
xlog.Debug("Parameter Config", "config", config)
switch config.Backend {
@ -184,7 +198,7 @@ func VideoEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfi
input.Prompt,
input.NegativePrompt,
src,
input.EndImage,
endSrc,
output,
input.NumFrames,
input.FPS,

View file

@ -15167,6 +15167,62 @@
- sd-3
- gpu
url: "github:mudler/LocalAI/gallery/stablediffusion3.yaml@master"
- name: wan-2.1-t2v-1.3b-ggml
license: apache-2.0
url: "github:mudler/LocalAI/gallery/wan-ggml.yaml@master"
description: |
Wan 2.1 T2V 1.3B — text-to-video diffusion model, GGUF-quantized for the
stable-diffusion.cpp backend. Generates short (33-frame) 832x480 clips
from a text prompt. Cheapest Wan variant, suitable for CPU-offloaded
inference with ~10 GB of usable RAM.
urls:
- https://huggingface.co/calcuis/wan-gguf
- https://huggingface.co/city96/umt5-xxl-encoder-gguf
tags:
- text-to-video
- wan
- video-generation
- cpu
- gpu
overrides:
parameters:
model: wan2.1-t2v-1.3B-Q8_0.gguf
files:
- filename: "wan2.1-t2v-1.3B-Q8_0.gguf"
uri: "huggingface://calcuis/wan-gguf/wan2.1-t2v-1.3B-Q8_0.gguf"
- filename: "wan_2.1_vae.safetensors"
uri: "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/vae/wan_2.1_vae.safetensors"
- filename: "umt5-xxl-encoder-Q8_0.gguf"
uri: "huggingface://city96/umt5-xxl-encoder-gguf/umt5-xxl-encoder-Q8_0.gguf"
- name: wan-2.1-i2v-14b-480p-ggml
license: apache-2.0
url: "github:mudler/LocalAI/gallery/wan-ggml.yaml@master"
description: |
Wan 2.1 I2V 14B 480P — image-to-video diffusion, GGUF Q4 quantization.
Animates a reference image into a 33-frame 480p clip. Requires more
RAM than the 1.3B T2V variant; CPU offload enabled by default.
urls:
- https://huggingface.co/city96/Wan2.1-I2V-14B-480P-gguf
tags:
- image-to-video
- wan
- video-generation
- cpu
- gpu
overrides:
parameters:
model: wan2.1-i2v-14b-480p-Q4_K_M.gguf
options:
- "clip_vision_path:clip_vision_h.safetensors"
files:
- filename: "wan2.1-i2v-14b-480p-Q4_K_M.gguf"
uri: "huggingface://city96/Wan2.1-I2V-14B-480P-gguf/wan2.1-i2v-14b-480p-Q4_K_M.gguf"
- filename: "wan_2.1_vae.safetensors"
uri: "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/vae/wan_2.1_vae.safetensors"
- filename: "umt5-xxl-encoder-Q8_0.gguf"
uri: "huggingface://city96/umt5-xxl-encoder-gguf/umt5-xxl-encoder-Q8_0.gguf"
- filename: "clip_vision_h.safetensors"
uri: "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/clip_vision/clip_vision_h.safetensors"
- name: sd-1.5-ggml
icon: https://avatars.githubusercontent.com/u/37351293
license: creativeml-openrail-m

19
gallery/wan-ggml.yaml Normal file
View file

@ -0,0 +1,19 @@
---
name: "wan-ggml"
config_file: |
backend: stablediffusion-ggml
step: 20
cfg_scale: 6.0
options:
- "diffusion_model"
- "vae_decode_only:false"
- "sampler:euler"
- "scheduler:discrete"
- "flow_shift:3.0"
- "diffusion_flash_attn:true"
- "offload_params_to_cpu:true"
- "keep_vae_on_cpu:true"
- "keep_clip_on_cpu:true"
- "t5xxl_path:umt5-xxl-encoder-Q8_0.gguf"
- "vae_path:wan_2.1_vae.safetensors"