mirror of
https://github.com/mudler/LocalAI
synced 2026-04-21 13:27:21 +00:00
feat: Add Kokoros backend (#9212)
Signed-off-by: Richard Palethorpe <io@richiejp.com>
This commit is contained in:
parent
b7247fc148
commit
ea6e850809
19 changed files with 4180 additions and 2 deletions
23
.github/workflows/test-extra.yml
vendored
23
.github/workflows/test-extra.yml
vendored
|
|
@ -31,6 +31,7 @@ jobs:
|
||||||
llama-cpp-quantization: ${{ steps.detect.outputs.llama-cpp-quantization }}
|
llama-cpp-quantization: ${{ steps.detect.outputs.llama-cpp-quantization }}
|
||||||
acestep-cpp: ${{ steps.detect.outputs.acestep-cpp }}
|
acestep-cpp: ${{ steps.detect.outputs.acestep-cpp }}
|
||||||
voxtral: ${{ steps.detect.outputs.voxtral }}
|
voxtral: ${{ steps.detect.outputs.voxtral }}
|
||||||
|
kokoros: ${{ steps.detect.outputs.kokoros }}
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout repository
|
- name: Checkout repository
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v6
|
||||||
|
|
@ -528,3 +529,25 @@ jobs:
|
||||||
- name: Test voxtral
|
- name: Test voxtral
|
||||||
run: |
|
run: |
|
||||||
make --jobs=5 --output-sync=target -C backend/go/voxtral test
|
make --jobs=5 --output-sync=target -C backend/go/voxtral test
|
||||||
|
# Builds and tests the Rust Kokoros TTS backend when it (or everything) changed.
tests-kokoros:
  needs: detect-changes
  if: needs.detect-changes.outputs.kokoros == 'true' || needs.detect-changes.outputs.run-all == 'true'
  runs-on: ubuntu-latest
  steps:
    - name: Clone
      uses: actions/checkout@v6
      with:
        # The Kokoros sources are vendored as a git submodule.
        submodules: true
    - name: Dependencies
      run: |
        sudo apt-get update
        sudo apt-get install -y build-essential cmake pkg-config protobuf-compiler clang libclang-dev
        sudo apt-get install -y espeak-ng libespeak-ng-dev libsonic-dev libpcaudio-dev libopus-dev libssl-dev
        curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
        # Quote the redirect target so the path is never word-split.
        echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"
    - name: Build kokoros
      run: |
        make -C backend/rust/kokoros kokoros-grpc
    - name: Test kokoros
      run: |
        make -C backend/rust/kokoros test
|
||||||
|
|
|
||||||
3
.gitmodules
vendored
3
.gitmodules
vendored
|
|
@ -1,3 +1,6 @@
|
||||||
[submodule "docs/themes/hugo-theme-relearn"]
|
[submodule "docs/themes/hugo-theme-relearn"]
|
||||||
path = docs/themes/hugo-theme-relearn
|
path = docs/themes/hugo-theme-relearn
|
||||||
url = https://github.com/McShelby/hugo-theme-relearn.git
|
url = https://github.com/McShelby/hugo-theme-relearn.git
|
||||||
|
[submodule "backend/rust/kokoros/sources/Kokoros"]
|
||||||
|
path = backend/rust/kokoros/sources/Kokoros
|
||||||
|
url = https://github.com/lucasjinreal/Kokoros
|
||||||
|
|
|
||||||
10
Makefile
10
Makefile
|
|
@ -1,5 +1,5 @@
|
||||||
# Disable parallel execution for backend builds
|
# Disable parallel execution for backend builds
|
||||||
.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization
|
.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros
|
||||||
|
|
||||||
GOCMD=go
|
GOCMD=go
|
||||||
GOTEST=$(GOCMD) test
|
GOTEST=$(GOCMD) test
|
||||||
|
|
@ -431,6 +431,7 @@ prepare-test-extra: protogen-python
|
||||||
$(MAKE) -C backend/python/whisperx
|
$(MAKE) -C backend/python/whisperx
|
||||||
$(MAKE) -C backend/python/ace-step
|
$(MAKE) -C backend/python/ace-step
|
||||||
$(MAKE) -C backend/python/trl
|
$(MAKE) -C backend/python/trl
|
||||||
|
$(MAKE) -C backend/rust/kokoros kokoros-grpc
|
||||||
|
|
||||||
test-extra: prepare-test-extra
|
test-extra: prepare-test-extra
|
||||||
$(MAKE) -C backend/python/transformers test
|
$(MAKE) -C backend/python/transformers test
|
||||||
|
|
@ -451,6 +452,7 @@ test-extra: prepare-test-extra
|
||||||
$(MAKE) -C backend/python/whisperx test
|
$(MAKE) -C backend/python/whisperx test
|
||||||
$(MAKE) -C backend/python/ace-step test
|
$(MAKE) -C backend/python/ace-step test
|
||||||
$(MAKE) -C backend/python/trl test
|
$(MAKE) -C backend/python/trl test
|
||||||
|
$(MAKE) -C backend/rust/kokoros test
|
||||||
|
|
||||||
DOCKER_IMAGE?=local-ai
|
DOCKER_IMAGE?=local-ai
|
||||||
IMAGE_TYPE?=core
|
IMAGE_TYPE?=core
|
||||||
|
|
@ -586,6 +588,9 @@ BACKEND_MLX_DISTRIBUTED = mlx-distributed|python|./|false|true
|
||||||
BACKEND_TRL = trl|python|.|false|true
|
BACKEND_TRL = trl|python|.|false|true
|
||||||
BACKEND_LLAMA_CPP_QUANTIZATION = llama-cpp-quantization|python|.|false|true
|
BACKEND_LLAMA_CPP_QUANTIZATION = llama-cpp-quantization|python|.|false|true
|
||||||
|
|
||||||
|
# Rust backends
|
||||||
|
BACKEND_KOKOROS = kokoros|rust|.|false|true
|
||||||
|
|
||||||
# Helper function to build docker image for a backend
|
# Helper function to build docker image for a backend
|
||||||
# Usage: $(call docker-build-backend,BACKEND_NAME,DOCKERFILE_TYPE,BUILD_CONTEXT,PROGRESS_FLAG,NEEDS_BACKEND_ARG)
|
# Usage: $(call docker-build-backend,BACKEND_NAME,DOCKERFILE_TYPE,BUILD_CONTEXT,PROGRESS_FLAG,NEEDS_BACKEND_ARG)
|
||||||
define docker-build-backend
|
define docker-build-backend
|
||||||
|
|
@ -644,12 +649,13 @@ $(eval $(call generate-docker-build-target,$(BACKEND_ACESTEP_CPP)))
|
||||||
$(eval $(call generate-docker-build-target,$(BACKEND_MLX_DISTRIBUTED)))
|
$(eval $(call generate-docker-build-target,$(BACKEND_MLX_DISTRIBUTED)))
|
||||||
$(eval $(call generate-docker-build-target,$(BACKEND_TRL)))
|
$(eval $(call generate-docker-build-target,$(BACKEND_TRL)))
|
||||||
$(eval $(call generate-docker-build-target,$(BACKEND_LLAMA_CPP_QUANTIZATION)))
|
$(eval $(call generate-docker-build-target,$(BACKEND_LLAMA_CPP_QUANTIZATION)))
|
||||||
|
$(eval $(call generate-docker-build-target,$(BACKEND_KOKOROS)))
|
||||||
|
|
||||||
# Pattern rule for docker-save targets
|
# Pattern rule for docker-save targets
|
||||||
docker-save-%: backend-images
|
docker-save-%: backend-images
|
||||||
docker save local-ai-backend:$* -o backend-images/$*.tar
|
docker save local-ai-backend:$* -o backend-images/$*.tar
|
||||||
|
|
||||||
docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization
|
docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-kokoros
|
||||||
|
|
||||||
########################################################
|
########################################################
|
||||||
### Mock Backend for E2E Tests
|
### Mock Backend for E2E Tests
|
||||||
|
|
|
||||||
39
backend/Dockerfile.rust
Normal file
39
backend/Dockerfile.rust
Normal file
|
|
@ -0,0 +1,39 @@
|
||||||
|
ARG BASE_IMAGE=ubuntu:24.04

# ---------------------------------------------------------------------------
# Stage 1: build the selected Rust backend and assemble its package directory
# ---------------------------------------------------------------------------
FROM ${BASE_IMAGE} AS builder
ARG BACKEND=kokoros
ENV DEBIAN_FRONTEND=noninteractive
ARG TARGETARCH
ARG TARGETVARIANT

# Toolchain plus the native libraries the backend links against.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        ccache \
        clang \
        cmake \
        curl \
        espeak-ng \
        git \
        libespeak-ng-dev \
        libopus-dev \
        libpcaudio-dev \
        libsonic-dev \
        libssl-dev \
        make \
        pkg-config \
        protobuf-compiler \
        unzip \
        wget && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Install Rust via rustup and expose cargo on PATH for later layers.
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"

COPY . /LocalAI

# The copied tree is owned by a different user than the one running git.
RUN git config --global --add safe.directory /LocalAI

RUN make -C /LocalAI/backend/rust/${BACKEND} build

# ---------------------------------------------------------------------------
# Stage 2: ship only the packaged backend
# ---------------------------------------------------------------------------
FROM scratch
ARG BACKEND=kokoros

COPY --from=builder /LocalAI/backend/rust/${BACKEND}/package/. ./
|
||||||
|
|
@ -469,6 +469,26 @@
|
||||||
nvidia-cuda-13: "cuda13-kokoro"
|
nvidia-cuda-13: "cuda13-kokoro"
|
||||||
nvidia-cuda-12: "cuda12-kokoro"
|
nvidia-cuda-12: "cuda12-kokoro"
|
||||||
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-kokoro"
|
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-kokoro"
|
||||||
|
- &kokoros
|
||||||
|
icon: https://avatars.githubusercontent.com/u/166769057?v=4
|
||||||
|
description: |
|
||||||
|
Kokoros is a pure Rust TTS backend using the Kokoro ONNX model (82M parameters).
|
||||||
|
It provides fast, high-quality text-to-speech with streaming support, built on
|
||||||
|
ONNX Runtime for efficient CPU inference. Supports English, Japanese, Mandarin
|
||||||
|
Chinese, and German.
|
||||||
|
urls:
|
||||||
|
- https://huggingface.co/hexgrad/Kokoro-82M
|
||||||
|
- https://github.com/lucasjinreal/Kokoros
|
||||||
|
tags:
|
||||||
|
- text-to-speech
|
||||||
|
- TTS
|
||||||
|
- Rust
|
||||||
|
- ONNX
|
||||||
|
license: apache-2.0
|
||||||
|
alias: "kokoros"
|
||||||
|
name: "kokoros"
|
||||||
|
capabilities:
|
||||||
|
default: "cpu-kokoros"
|
||||||
- &coqui
|
- &coqui
|
||||||
urls:
|
urls:
|
||||||
- https://github.com/idiap/coqui-ai-TTS
|
- https://github.com/idiap/coqui-ai-TTS
|
||||||
|
|
@ -2043,6 +2063,21 @@
|
||||||
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-kokoro"
|
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-kokoro"
|
||||||
mirrors:
|
mirrors:
|
||||||
- localai/localai-backends:master-metal-darwin-arm64-kokoro
|
- localai/localai-backends:master-metal-darwin-arm64-kokoro
|
||||||
|
## kokoros (Rust)
|
||||||
|
- !!merge <<: *kokoros
|
||||||
|
name: "kokoros-development"
|
||||||
|
capabilities:
|
||||||
|
default: "cpu-kokoros-development"
|
||||||
|
- !!merge <<: *kokoros
|
||||||
|
name: "cpu-kokoros"
|
||||||
|
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-kokoros"
|
||||||
|
mirrors:
|
||||||
|
- localai/localai-backends:latest-cpu-kokoros
|
||||||
|
- !!merge <<: *kokoros
|
||||||
|
name: "cpu-kokoros-development"
|
||||||
|
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-kokoros"
|
||||||
|
mirrors:
|
||||||
|
- localai/localai-backends:master-cpu-kokoros
|
||||||
## faster-whisper
|
## faster-whisper
|
||||||
- !!merge <<: *faster-whisper
|
- !!merge <<: *faster-whisper
|
||||||
name: "faster-whisper-development"
|
name: "faster-whisper-development"
|
||||||
|
|
|
||||||
3
backend/rust/kokoros/.gitignore
vendored
Normal file
3
backend/rust/kokoros/.gitignore
vendored
Normal file
|
|
@ -0,0 +1,3 @@
|
||||||
|
/target/
|
||||||
|
/proto/
|
||||||
|
/package/
|
||||||
3074
backend/rust/kokoros/Cargo.lock
generated
Normal file
3074
backend/rust/kokoros/Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load diff
26
backend/rust/kokoros/Cargo.toml
Normal file
26
backend/rust/kokoros/Cargo.toml
Normal file
|
|
@ -0,0 +1,26 @@
|
||||||
|
[package]
|
||||||
|
name = "kokoros-grpc"
|
||||||
|
version = "0.1.0"
|
||||||
|
edition = "2021"
|
||||||
|
|
||||||
|
[[bin]]
|
||||||
|
name = "kokoros-grpc"
|
||||||
|
path = "src/main.rs"
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
kokoros = { path = "sources/Kokoros/kokoros" }
|
||||||
|
|
||||||
|
tonic = "0.13"
|
||||||
|
prost = "0.13"
|
||||||
|
tokio = { version = "1", features = ["full"] }
|
||||||
|
tokio-stream = "0.1"
|
||||||
|
clap = { version = "4", features = ["derive"] }
|
||||||
|
tracing = "0.1"
|
||||||
|
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
||||||
|
|
||||||
|
[build-dependencies]
|
||||||
|
tonic-build = "0.13"
|
||||||
|
|
||||||
|
[features]
|
||||||
|
default = ["cpu"]
|
||||||
|
cpu = ["kokoros/cpu"]
|
||||||
25
backend/rust/kokoros/Makefile
Normal file
25
backend/rust/kokoros/Makefile
Normal file
|
|
@ -0,0 +1,25 @@
|
||||||
|
# Build, package and test the Kokoros gRPC backend.
CURRENT_DIR=$(abspath ./)

# Compile the release binary. backend.proto is copied next to the crate so
# build.rs can find it through BACKEND_PROTO_PATH.
.PHONY: kokoros-grpc
kokoros-grpc:
	mkdir -p $(CURRENT_DIR)/proto
	cp $(CURRENT_DIR)/../../backend.proto $(CURRENT_DIR)/proto/backend.proto
	cd $(CURRENT_DIR) && \
	BACKEND_PROTO_PATH=$(CURRENT_DIR)/proto/backend.proto \
	cargo build --release

# package.sh copies target/release/kokoros-grpc, so the binary must exist
# first. Declaring the prerequisite here keeps `make -j build` from running
# the two steps concurrently.
.PHONY: package
package: kokoros-grpc
	bash package.sh

.PHONY: test
test: kokoros-grpc
	cd $(CURRENT_DIR) && cargo test

.PHONY: build
build: kokoros-grpc package

.PHONY: clean
clean:
	cargo clean
	rm -rf package proto
|
||||||
15
backend/rust/kokoros/build.rs
Normal file
15
backend/rust/kokoros/build.rs
Normal file
|
|
@ -0,0 +1,15 @@
|
||||||
|
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||||
|
let proto_path = std::env::var("BACKEND_PROTO_PATH")
|
||||||
|
.unwrap_or_else(|_| "proto/backend.proto".to_string());
|
||||||
|
|
||||||
|
let proto_dir = std::path::Path::new(&proto_path)
|
||||||
|
.parent()
|
||||||
|
.unwrap_or(std::path::Path::new("."));
|
||||||
|
|
||||||
|
tonic_build::configure()
|
||||||
|
.build_server(true)
|
||||||
|
.build_client(false)
|
||||||
|
.compile_protos(&[&proto_path], &[proto_dir])?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
42
backend/rust/kokoros/package.sh
Normal file
42
backend/rust/kokoros/package.sh
Normal file
|
|
@ -0,0 +1,42 @@
|
||||||
|
#!/bin/bash
# Assemble a self-contained package/ directory for the kokoros-grpc backend:
# the binary, run.sh, espeak-ng data, every shared library the binary links
# against, CA certificates and the dynamic linker.
set -euo pipefail

CURDIR=$(dirname "$(realpath "$0")")
PKG="$CURDIR/package"
BIN="$CURDIR/target/release/kokoros-grpc"

mkdir -p "$PKG/lib"

# Copy the binary and run script
cp -avf "$BIN" "$PKG/"
cp -rfv "$CURDIR/run.sh" "$PKG/"
chmod +x "$PKG/run.sh"

# Copy espeak-ng data. Debian/Ubuntu install it under the multiarch library
# directory, whose name depends on the CPU architecture, so glob for it
# instead of hard-coding x86_64. An unmatched glob stays literal and simply
# fails the -d test.
for data_dir in /usr/share/espeak-ng-data /usr/lib/*/espeak-ng-data; do
    if [ -d "$data_dir" ]; then
        cp -rf "$data_dir" "$PKG/"
        break
    fi
done

# Bundle all dynamic library dependencies. With pipefail a failing ldd aborts
# the script instead of silently producing a package without libraries.
echo "Bundling dynamic library dependencies..."
ldd "$BIN" | awk '/=>/ { print $3 }' | while read -r lib; do
    if [ -n "$lib" ] && [ -f "$lib" ]; then
        cp -avfL "$lib" "$PKG/lib/"
    fi
done

# Copy CA certificates for HTTPS (needed for model auto-download)
if [ -d "/etc/ssl/certs" ]; then
    mkdir -p "$PKG/etc/ssl"
    cp -rf /etc/ssl/certs "$PKG/etc/ssl/"
fi

# Copy the dynamic linker
if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
    cp -arfLv /lib64/ld-linux-x86-64.so.2 "$PKG/lib/ld.so"
elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
    cp -arfLv /lib/ld-linux-aarch64.so.1 "$PKG/lib/ld.so"
fi

echo "Packaging completed successfully"
ls -liah "$PKG/"
ls -liah "$PKG/lib/"
|
||||||
23
backend/rust/kokoros/run.sh
Executable file
23
backend/rust/kokoros/run.sh
Executable file
|
|
@ -0,0 +1,23 @@
|
||||||
|
#!/bin/bash
# Launcher for the packaged kokoros-grpc backend. Points the process at the
# bundled libraries, certificates and espeak-ng data, then execs the binary
# (through the bundled dynamic linker when one was packaged).
set -ex

CURDIR=$(dirname "$(realpath "$0")")

# Only append the inherited value when it is non-empty: a trailing ':' would
# add an empty entry, which the loader treats as the current directory.
export LD_LIBRARY_PATH="$CURDIR/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"

# SSL certificates for model auto-download
if [ -d "$CURDIR/etc/ssl/certs" ]; then
    export SSL_CERT_DIR="$CURDIR/etc/ssl/certs"
fi

# espeak-ng data directory
if [ -d "$CURDIR/espeak-ng-data" ]; then
    export ESPEAK_NG_DATA="$CURDIR/espeak-ng-data"
fi

# Use bundled ld.so if present (portability)
if [ -f "$CURDIR/lib/ld.so" ]; then
    exec "$CURDIR/lib/ld.so" "$CURDIR/kokoros-grpc" "$@"
fi

exec "$CURDIR/kokoros-grpc" "$@"
|
||||||
1
backend/rust/kokoros/sources/Kokoros
Submodule
1
backend/rust/kokoros/sources/Kokoros
Submodule
|
|
@ -0,0 +1 @@
|
||||||
|
Subproject commit 7089168f0ca2d8e1fcd8e523c9d75d915c6afdff
|
||||||
26
backend/rust/kokoros/src/auth.rs
Normal file
26
backend/rust/kokoros/src/auth.rs
Normal file
|
|
@ -0,0 +1,26 @@
|
||||||
|
use tonic::{Request, Status};
|
||||||
|
|
||||||
|
/// Returns an interceptor function if LOCALAI_GRPC_AUTH_TOKEN is set.
|
||||||
|
pub fn make_auth_interceptor(
|
||||||
|
) -> Option<impl Fn(Request<()>) -> Result<Request<()>, Status> + Clone> {
|
||||||
|
let token = std::env::var("LOCALAI_GRPC_AUTH_TOKEN").ok()?;
|
||||||
|
if token.is_empty() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
let expected = format!("Bearer {}", token);
|
||||||
|
Some(
|
||||||
|
move |req: Request<()>| -> Result<Request<()>, Status> {
|
||||||
|
let meta = req.metadata();
|
||||||
|
match meta.get("authorization") {
|
||||||
|
Some(val) => {
|
||||||
|
if val.as_bytes() == expected.as_bytes() {
|
||||||
|
Ok(req)
|
||||||
|
} else {
|
||||||
|
Err(Status::unauthenticated("invalid token"))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None => Err(Status::unauthenticated("missing authorization")),
|
||||||
|
}
|
||||||
|
},
|
||||||
|
)
|
||||||
|
}
|
||||||
53
backend/rust/kokoros/src/main.rs
Normal file
53
backend/rust/kokoros/src/main.rs
Normal file
|
|
@ -0,0 +1,53 @@
|
||||||
|
use clap::Parser;
|
||||||
|
use tonic::transport::Server;
|
||||||
|
|
||||||
|
mod auth;
|
||||||
|
mod service;
|
||||||
|
|
||||||
|
pub mod backend {
|
||||||
|
tonic::include_proto!("backend");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Parser, Debug)]
|
||||||
|
#[command(name = "kokoros-grpc")]
|
||||||
|
struct Cli {
|
||||||
|
/// gRPC listen address (host:port)
|
||||||
|
#[arg(long, default_value = "localhost:50051")]
|
||||||
|
addr: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::main]
|
||||||
|
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||||
|
tracing_subscriber::fmt()
|
||||||
|
.with_writer(std::io::stderr)
|
||||||
|
.with_ansi(false)
|
||||||
|
.without_time()
|
||||||
|
.with_env_filter(
|
||||||
|
tracing_subscriber::EnvFilter::try_from_default_env()
|
||||||
|
.unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info")),
|
||||||
|
)
|
||||||
|
.init();
|
||||||
|
|
||||||
|
let cli = Cli::parse();
|
||||||
|
let addr = cli.addr.parse()?;
|
||||||
|
|
||||||
|
tracing::info!("Starting kokoros gRPC server on {}", addr);
|
||||||
|
|
||||||
|
let mut builder = Server::builder();
|
||||||
|
|
||||||
|
if let Some(interceptor) = auth::make_auth_interceptor() {
|
||||||
|
tracing::info!("Bearer token authentication enabled");
|
||||||
|
let svc = backend::backend_server::BackendServer::with_interceptor(
|
||||||
|
service::KokorosService::default(),
|
||||||
|
interceptor,
|
||||||
|
);
|
||||||
|
builder.add_service(svc).serve(addr).await?;
|
||||||
|
} else {
|
||||||
|
let svc = backend::backend_server::BackendServer::new(service::KokorosService::default())
|
||||||
|
.max_decoding_message_size(50 * 1024 * 1024)
|
||||||
|
.max_encoding_message_size(50 * 1024 * 1024);
|
||||||
|
builder.add_service(svc).serve(addr).await?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
652
backend/rust/kokoros/src/service.rs
Normal file
652
backend/rust/kokoros/src/service.rs
Normal file
|
|
@ -0,0 +1,652 @@
|
||||||
|
use std::sync::{Arc, Mutex};
|
||||||
|
use tokio::sync::Mutex as TokioMutex;
|
||||||
|
use tokio_stream::wrappers::ReceiverStream;
|
||||||
|
use tonic::{Request, Response, Status};
|
||||||
|
|
||||||
|
use kokoros::tts::koko::TTSKoko;
|
||||||
|
|
||||||
|
use crate::backend;
|
||||||
|
use crate::backend::backend_server::Backend;
|
||||||
|
|
||||||
|
/// Write f32 samples as a standard 44-byte PCM 16-bit WAV file.
/// LocalAI's audio pipeline assumes this exact header layout.
///
/// * `path` - destination file; created or truncated.
/// * `samples` - mono samples; each is clamped to [-1.0, 1.0] and scaled to
///   a signed 16-bit value.
/// * `sample_rate` - sample rate in Hz, written to the header.
///
/// Returns an error if the file cannot be created or written, or if the
/// audio does not fit the 32-bit size fields of the header.
fn write_pcm16_wav(
    path: &str,
    samples: &[f32],
    sample_rate: u32,
) -> Result<(), Box<dyn std::error::Error>> {
    use std::fs::File;
    use std::io::{BufWriter, Write};

    const BYTES_PER_SAMPLE: u32 = 2; // 16-bit mono
    const HEADER_AFTER_RIFF_SIZE: u32 = 36; // header bytes counted in the RIFF size

    // Reject sizes that would overflow the header fields instead of wrapping.
    let data_size = u32::try_from(samples.len())
        .ok()
        .and_then(|n| n.checked_mul(BYTES_PER_SAMPLE))
        .filter(|&bytes| bytes <= u32::MAX - HEADER_AFTER_RIFF_SIZE)
        .ok_or("audio too long for a WAV file (RIFF sizes are 32-bit)")?;
    let file_size = HEADER_AFTER_RIFF_SIZE + data_size;
    let byte_rate = sample_rate
        .checked_mul(BYTES_PER_SAMPLE)
        .ok_or("sample rate too large for a 16-bit mono WAV header")?;

    // Buffer the output: the sample loop below would otherwise issue one
    // tiny write per sample.
    let mut f = BufWriter::new(File::create(path)?);

    // RIFF header
    f.write_all(b"RIFF")?;
    f.write_all(&file_size.to_le_bytes())?;
    f.write_all(b"WAVE")?;

    // fmt chunk — standard 16-byte PCM format
    f.write_all(b"fmt ")?;
    f.write_all(&16u32.to_le_bytes())?; // chunk size
    f.write_all(&1u16.to_le_bytes())?; // audio format = PCM
    f.write_all(&1u16.to_le_bytes())?; // channels = mono
    f.write_all(&sample_rate.to_le_bytes())?;
    f.write_all(&byte_rate.to_le_bytes())?; // byte rate
    f.write_all(&2u16.to_le_bytes())?; // block align
    f.write_all(&16u16.to_le_bytes())?; // bits per sample

    // data chunk
    f.write_all(b"data")?;
    f.write_all(&data_size.to_le_bytes())?;

    for &s in samples {
        let pcm = (s.clamp(-1.0, 1.0) * 32767.0) as i16;
        f.write_all(&pcm.to_le_bytes())?;
    }

    // Flush explicitly so write errors are reported rather than swallowed
    // when the buffer is dropped.
    f.flush()?;
    Ok(())
}
|
||||||
|
|
||||||
|
pub struct KokorosService {
|
||||||
|
tts: Arc<TokioMutex<Option<TTSKoko>>>,
|
||||||
|
language: Arc<Mutex<String>>,
|
||||||
|
speed: Arc<Mutex<f32>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for KokorosService {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self {
|
||||||
|
tts: Arc::new(TokioMutex::new(None)),
|
||||||
|
language: Arc::new(Mutex::new("en-us".to_string())),
|
||||||
|
speed: Arc::new(Mutex::new(1.0)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tonic::async_trait]
|
||||||
|
impl Backend for KokorosService {
|
||||||
|
async fn health(
|
||||||
|
&self,
|
||||||
|
_req: Request<backend::HealthMessage>,
|
||||||
|
) -> Result<Response<backend::Reply>, Status> {
|
||||||
|
Ok(Response::new(backend::Reply {
|
||||||
|
message: b"OK".to_vec(),
|
||||||
|
..Default::default()
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn load_model(
|
||||||
|
&self,
|
||||||
|
req: Request<backend::ModelOptions>,
|
||||||
|
) -> Result<Response<backend::Result>, Status> {
|
||||||
|
let opts = req.into_inner();
|
||||||
|
|
||||||
|
// Model path: join ModelPath + Model, or just Model
|
||||||
|
let model_path = if !opts.model_path.is_empty() && !opts.model.is_empty() {
|
||||||
|
format!("{}/{}", opts.model_path, opts.model)
|
||||||
|
} else if !opts.model.is_empty() {
|
||||||
|
opts.model.clone()
|
||||||
|
} else {
|
||||||
|
"checkpoints/kokoro-v1.0.onnx".to_string()
|
||||||
|
};
|
||||||
|
|
||||||
|
// Voices data path from AudioPath, or derive from model dir
|
||||||
|
let voices_path = if !opts.audio_path.is_empty() {
|
||||||
|
opts.audio_path.clone()
|
||||||
|
} else {
|
||||||
|
let model_dir = std::path::Path::new(&model_path)
|
||||||
|
.parent()
|
||||||
|
.map(|p| p.to_string_lossy().to_string())
|
||||||
|
.unwrap_or_else(|| ".".to_string());
|
||||||
|
format!("{}/voices-v1.0.bin", model_dir)
|
||||||
|
};
|
||||||
|
|
||||||
|
// Parse options (key:value pairs)
|
||||||
|
for opt in &opts.options {
|
||||||
|
if let Some((key, value)) = opt.split_once(':') {
|
||||||
|
match key {
|
||||||
|
"lang_code" => *self.language.lock().unwrap() = value.to_string(),
|
||||||
|
"speed" => {
|
||||||
|
if let Ok(s) = value.parse::<f32>() {
|
||||||
|
*self.speed.lock().unwrap() = s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
tracing::info!("Loading Kokoros model from: {}", model_path);
|
||||||
|
tracing::info!("Loading voices from: {}", voices_path);
|
||||||
|
tracing::info!("Language: {}", self.language.lock().unwrap());
|
||||||
|
|
||||||
|
let tts = TTSKoko::new(&model_path, &voices_path).await;
|
||||||
|
*self.tts.lock().await = Some(tts);
|
||||||
|
|
||||||
|
tracing::info!("Kokoros TTS model loaded successfully");
|
||||||
|
Ok(Response::new(backend::Result {
|
||||||
|
success: true,
|
||||||
|
message: "Kokoros TTS model loaded".into(),
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn tts(
|
||||||
|
&self,
|
||||||
|
req: Request<backend::TtsRequest>,
|
||||||
|
) -> Result<Response<backend::Result>, Status> {
|
||||||
|
let req = req.into_inner();
|
||||||
|
let tts_guard = self.tts.lock().await;
|
||||||
|
let tts = tts_guard
|
||||||
|
.as_ref()
|
||||||
|
.ok_or_else(|| Status::failed_precondition("Model not loaded"))?;
|
||||||
|
|
||||||
|
let voice = if req.voice.is_empty() {
|
||||||
|
"af_heart"
|
||||||
|
} else {
|
||||||
|
&req.voice
|
||||||
|
};
|
||||||
|
let lang = req
|
||||||
|
.language
|
||||||
|
.filter(|l| !l.is_empty())
|
||||||
|
.unwrap_or_else(|| self.language.lock().unwrap().clone());
|
||||||
|
let speed = *self.speed.lock().unwrap();
|
||||||
|
|
||||||
|
tracing::info!(
|
||||||
|
text = req.text,
|
||||||
|
voice = voice,
|
||||||
|
lang = lang.as_str(),
|
||||||
|
dst = req.dst,
|
||||||
|
"TTS request received"
|
||||||
|
);
|
||||||
|
|
||||||
|
let start = std::time::Instant::now();
|
||||||
|
match tts.tts_raw_audio(&req.text, &lang, voice, speed, None, None, None, None) {
|
||||||
|
Ok(samples) => {
|
||||||
|
let duration_secs = samples.len() as f64 / 24000.0;
|
||||||
|
tracing::info!(
|
||||||
|
num_samples = samples.len(),
|
||||||
|
audio_duration = format!("{:.2}s", duration_secs),
|
||||||
|
inference_time = format!("{:.2}s", start.elapsed().as_secs_f64()),
|
||||||
|
dst = req.dst,
|
||||||
|
"TTS inference complete"
|
||||||
|
);
|
||||||
|
if let Err(e) = write_pcm16_wav(&req.dst, &samples, 24000) {
|
||||||
|
tracing::error!("Failed to write WAV to {}: {}", req.dst, e);
|
||||||
|
return Ok(Response::new(backend::Result {
|
||||||
|
success: false,
|
||||||
|
message: format!("Failed to write WAV: {}", e),
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
Ok(Response::new(backend::Result {
|
||||||
|
success: true,
|
||||||
|
message: String::new(),
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
tracing::error!("TTS error: {}", e);
|
||||||
|
Ok(Response::new(backend::Result {
|
||||||
|
success: false,
|
||||||
|
message: format!("TTS error: {}", e),
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
type TTSStreamStream = ReceiverStream<Result<backend::Reply, Status>>;
|
||||||
|
|
||||||
|
async fn tts_stream(
|
||||||
|
&self,
|
||||||
|
req: Request<backend::TtsRequest>,
|
||||||
|
) -> Result<Response<Self::TTSStreamStream>, Status> {
|
||||||
|
let req = req.into_inner();
|
||||||
|
let tts_guard = self.tts.lock().await;
|
||||||
|
let tts = tts_guard
|
||||||
|
.as_ref()
|
||||||
|
.ok_or_else(|| Status::failed_precondition("Model not loaded"))?
|
||||||
|
.clone();
|
||||||
|
|
||||||
|
let voice = if req.voice.is_empty() {
|
||||||
|
"af_heart".to_string()
|
||||||
|
} else {
|
||||||
|
req.voice
|
||||||
|
};
|
||||||
|
let lang = req
|
||||||
|
.language
|
||||||
|
.filter(|l| !l.is_empty())
|
||||||
|
.unwrap_or_else(|| self.language.lock().unwrap().clone());
|
||||||
|
let speed = *self.speed.lock().unwrap();
|
||||||
|
let text = req.text;
|
||||||
|
|
||||||
|
let (tx, rx) = tokio::sync::mpsc::channel(32);
|
||||||
|
|
||||||
|
// Send sample rate info as first message
|
||||||
|
let tx_clone = tx.clone();
|
||||||
|
let _ = tx_clone
|
||||||
|
.send(Ok(backend::Reply {
|
||||||
|
message: br#"{"sample_rate":24000}"#.to_vec(),
|
||||||
|
..Default::default()
|
||||||
|
}))
|
||||||
|
.await;
|
||||||
|
|
||||||
|
tokio::task::spawn_blocking(move || {
|
||||||
|
let result = tts.tts_raw_audio_streaming(
|
||||||
|
&text,
|
||||||
|
&lang,
|
||||||
|
&voice,
|
||||||
|
speed,
|
||||||
|
None,
|
||||||
|
None,
|
||||||
|
None,
|
||||||
|
None,
|
||||||
|
|audio_chunk: Vec<f32>| -> Result<(), Box<dyn std::error::Error>> {
|
||||||
|
// Convert f32 PCM to 16-bit PCM bytes (what LocalAI expects for streaming)
|
||||||
|
let bytes: Vec<u8> = audio_chunk
|
||||||
|
.iter()
|
||||||
|
.flat_map(|&s| {
|
||||||
|
let clamped = s.clamp(-1.0, 1.0);
|
||||||
|
let i16_val = (clamped * 32767.0) as i16;
|
||||||
|
i16_val.to_le_bytes()
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
tx.blocking_send(Ok(backend::Reply {
|
||||||
|
audio: bytes,
|
||||||
|
..Default::default()
|
||||||
|
}))
|
||||||
|
.map_err(|e| Box::new(e) as Box<dyn std::error::Error>)
|
||||||
|
},
|
||||||
|
);
|
||||||
|
if let Err(e) = result {
|
||||||
|
tracing::error!("TTSStream error: {}", e);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
Ok(Response::new(ReceiverStream::new(rx)))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn status(
|
||||||
|
&self,
|
||||||
|
_req: Request<backend::HealthMessage>,
|
||||||
|
) -> Result<Response<backend::StatusResponse>, Status> {
|
||||||
|
let tts = self.tts.lock().await;
|
||||||
|
let state = if tts.is_some() {
|
||||||
|
backend::status_response::State::Ready as i32
|
||||||
|
} else {
|
||||||
|
backend::status_response::State::Uninitialized as i32
|
||||||
|
};
|
||||||
|
Ok(Response::new(backend::StatusResponse {
|
||||||
|
state,
|
||||||
|
memory: None,
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn free(
|
||||||
|
&self,
|
||||||
|
_req: Request<backend::HealthMessage>,
|
||||||
|
) -> Result<Response<backend::Result>, Status> {
|
||||||
|
*self.tts.lock().await = None;
|
||||||
|
Ok(Response::new(backend::Result {
|
||||||
|
success: true,
|
||||||
|
message: "Model freed".into(),
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- Unimplemented RPCs ---
|
||||||
|
|
||||||
|
async fn predict(
|
||||||
|
&self,
|
||||||
|
_: Request<backend::PredictOptions>,
|
||||||
|
) -> Result<Response<backend::Reply>, Status> {
|
||||||
|
Err(Status::unimplemented("Not supported"))
|
||||||
|
}
|
||||||
|
|
||||||
|
type PredictStreamStream = ReceiverStream<Result<backend::Reply, Status>>;
|
||||||
|
|
||||||
|
async fn predict_stream(
|
||||||
|
&self,
|
||||||
|
_: Request<backend::PredictOptions>,
|
||||||
|
) -> Result<Response<Self::PredictStreamStream>, Status> {
|
||||||
|
Err(Status::unimplemented("Not supported"))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn embedding(
|
||||||
|
&self,
|
||||||
|
_: Request<backend::PredictOptions>,
|
||||||
|
) -> Result<Response<backend::EmbeddingResult>, Status> {
|
||||||
|
Err(Status::unimplemented("Not supported"))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn generate_image(
|
||||||
|
&self,
|
||||||
|
_: Request<backend::GenerateImageRequest>,
|
||||||
|
) -> Result<Response<backend::Result>, Status> {
|
||||||
|
Err(Status::unimplemented("Not supported"))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn generate_video(
|
||||||
|
&self,
|
||||||
|
_: Request<backend::GenerateVideoRequest>,
|
||||||
|
) -> Result<Response<backend::Result>, Status> {
|
||||||
|
Err(Status::unimplemented("Not supported"))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn audio_transcription(
|
||||||
|
&self,
|
||||||
|
_: Request<backend::TranscriptRequest>,
|
||||||
|
) -> Result<Response<backend::TranscriptResult>, Status> {
|
||||||
|
Err(Status::unimplemented("Not supported"))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn sound_generation(
|
||||||
|
&self,
|
||||||
|
_: Request<backend::SoundGenerationRequest>,
|
||||||
|
) -> Result<Response<backend::Result>, Status> {
|
||||||
|
Err(Status::unimplemented("Not supported"))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn tokenize_string(
|
||||||
|
&self,
|
||||||
|
_: Request<backend::PredictOptions>,
|
||||||
|
) -> Result<Response<backend::TokenizationResponse>, Status> {
|
||||||
|
Err(Status::unimplemented("Not supported"))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn detect(
|
||||||
|
&self,
|
||||||
|
_: Request<backend::DetectOptions>,
|
||||||
|
) -> Result<Response<backend::DetectResponse>, Status> {
|
||||||
|
Err(Status::unimplemented("Not supported"))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn stores_set(
|
||||||
|
&self,
|
||||||
|
_: Request<backend::StoresSetOptions>,
|
||||||
|
) -> Result<Response<backend::Result>, Status> {
|
||||||
|
Err(Status::unimplemented("Not supported"))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn stores_delete(
|
||||||
|
&self,
|
||||||
|
_: Request<backend::StoresDeleteOptions>,
|
||||||
|
) -> Result<Response<backend::Result>, Status> {
|
||||||
|
Err(Status::unimplemented("Not supported"))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn stores_get(
|
||||||
|
&self,
|
||||||
|
_: Request<backend::StoresGetOptions>,
|
||||||
|
) -> Result<Response<backend::StoresGetResult>, Status> {
|
||||||
|
Err(Status::unimplemented("Not supported"))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn stores_find(
|
||||||
|
&self,
|
||||||
|
_: Request<backend::StoresFindOptions>,
|
||||||
|
) -> Result<Response<backend::StoresFindResult>, Status> {
|
||||||
|
Err(Status::unimplemented("Not supported"))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn rerank(
|
||||||
|
&self,
|
||||||
|
_: Request<backend::RerankRequest>,
|
||||||
|
) -> Result<Response<backend::RerankResult>, Status> {
|
||||||
|
Err(Status::unimplemented("Not supported"))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn get_metrics(
|
||||||
|
&self,
|
||||||
|
_: Request<backend::MetricsRequest>,
|
||||||
|
) -> Result<Response<backend::MetricsResponse>, Status> {
|
||||||
|
Err(Status::unimplemented("Not supported"))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn vad(
|
||||||
|
&self,
|
||||||
|
_: Request<backend::VadRequest>,
|
||||||
|
) -> Result<Response<backend::VadResponse>, Status> {
|
||||||
|
Err(Status::unimplemented("Not supported"))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn audio_encode(
|
||||||
|
&self,
|
||||||
|
_: Request<backend::AudioEncodeRequest>,
|
||||||
|
) -> Result<Response<backend::AudioEncodeResult>, Status> {
|
||||||
|
Err(Status::unimplemented("Not supported"))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn audio_decode(
|
||||||
|
&self,
|
||||||
|
_: Request<backend::AudioDecodeRequest>,
|
||||||
|
) -> Result<Response<backend::AudioDecodeResult>, Status> {
|
||||||
|
Err(Status::unimplemented("Not supported"))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn model_metadata(
|
||||||
|
&self,
|
||||||
|
_: Request<backend::ModelOptions>,
|
||||||
|
) -> Result<Response<backend::ModelMetadataResponse>, Status> {
|
||||||
|
Err(Status::unimplemented("Not supported"))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn start_fine_tune(
|
||||||
|
&self,
|
||||||
|
_: Request<backend::FineTuneRequest>,
|
||||||
|
) -> Result<Response<backend::FineTuneJobResult>, Status> {
|
||||||
|
Err(Status::unimplemented("Not supported"))
|
||||||
|
}
|
||||||
|
|
||||||
|
type FineTuneProgressStream = ReceiverStream<Result<backend::FineTuneProgressUpdate, Status>>;
|
||||||
|
|
||||||
|
async fn fine_tune_progress(
|
||||||
|
&self,
|
||||||
|
_: Request<backend::FineTuneProgressRequest>,
|
||||||
|
) -> Result<Response<Self::FineTuneProgressStream>, Status> {
|
||||||
|
Err(Status::unimplemented("Not supported"))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn stop_fine_tune(
|
||||||
|
&self,
|
||||||
|
_: Request<backend::FineTuneStopRequest>,
|
||||||
|
) -> Result<Response<backend::Result>, Status> {
|
||||||
|
Err(Status::unimplemented("Not supported"))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn list_checkpoints(
|
||||||
|
&self,
|
||||||
|
_: Request<backend::ListCheckpointsRequest>,
|
||||||
|
) -> Result<Response<backend::ListCheckpointsResponse>, Status> {
|
||||||
|
Err(Status::unimplemented("Not supported"))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn export_model(
|
||||||
|
&self,
|
||||||
|
_: Request<backend::ExportModelRequest>,
|
||||||
|
) -> Result<Response<backend::Result>, Status> {
|
||||||
|
Err(Status::unimplemented("Not supported"))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn start_quantization(
|
||||||
|
&self,
|
||||||
|
_: Request<backend::QuantizationRequest>,
|
||||||
|
) -> Result<Response<backend::QuantizationJobResult>, Status> {
|
||||||
|
Err(Status::unimplemented("Not supported"))
|
||||||
|
}
|
||||||
|
|
||||||
|
type QuantizationProgressStream =
|
||||||
|
ReceiverStream<Result<backend::QuantizationProgressUpdate, Status>>;
|
||||||
|
|
||||||
|
async fn quantization_progress(
|
||||||
|
&self,
|
||||||
|
_: Request<backend::QuantizationProgressRequest>,
|
||||||
|
) -> Result<Response<Self::QuantizationProgressStream>, Status> {
|
||||||
|
Err(Status::unimplemented("Not supported"))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn stop_quantization(
|
||||||
|
&self,
|
||||||
|
_: Request<backend::QuantizationStopRequest>,
|
||||||
|
) -> Result<Response<backend::Result>, Status> {
|
||||||
|
Err(Status::unimplemented("Not supported"))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
/// Writes five known samples with `write_pcm16_wav` and checks that the
/// resulting file is a 44-byte-header, mono, 16-bit PCM WAV at 24 kHz whose
/// payload holds the expected integer sample values.
#[test]
fn wav_header_is_standard_pcm16() {
    let samples = vec![0.0f32, 0.5, -0.5, 1.0, -1.0];
    // Include the process id in the file name so concurrent test processes
    // sharing one temp directory cannot clobber each other's file.
    let path = std::env::temp_dir().join(format!("kokoros_test_{}.wav", std::process::id()));
    let path_str = path.to_str().unwrap();

    write_pcm16_wav(path_str, &samples, 24000).unwrap();

    // Read everything back and delete the file before asserting, so a failed
    // assertion does not leave the file behind.
    let data = std::fs::read(&path).unwrap();
    std::fs::remove_file(&path).unwrap();

    // Must be exactly 44-byte header + data
    assert_eq!(data.len(), 44 + samples.len() * 2);

    // RIFF header
    assert_eq!(&data[0..4], b"RIFF");
    assert_eq!(&data[8..12], b"WAVE");

    // fmt chunk: 16 bytes, format=1 (PCM), channels=1, 16-bit
    assert_eq!(&data[12..16], b"fmt ");
    assert_eq!(u32::from_le_bytes(data[16..20].try_into().unwrap()), 16); // chunk size
    assert_eq!(u16::from_le_bytes(data[20..22].try_into().unwrap()), 1); // PCM format
    assert_eq!(u16::from_le_bytes(data[22..24].try_into().unwrap()), 1); // mono
    assert_eq!(u32::from_le_bytes(data[24..28].try_into().unwrap()), 24000); // sample rate
    assert_eq!(u16::from_le_bytes(data[34..36].try_into().unwrap()), 16); // bits per sample

    // data chunk
    assert_eq!(&data[36..40], b"data");
    assert_eq!(
        u32::from_le_bytes(data[40..44].try_into().unwrap()),
        (samples.len() * 2) as u32
    );

    // Sample `i` occupies the two bytes starting at offset 44 + 2 * i.
    let sample_at = |i: usize| -> i16 {
        let start = 44 + 2 * i;
        i16::from_le_bytes(data[start..start + 2].try_into().unwrap())
    };

    // Verify sample values:
    // 0.0 -> 0, 0.5 -> 16383, -0.5 -> -16383, 1.0 -> 32767, -1.0 -> -32767
    assert_eq!(sample_at(0), 0); // silence stays zero
    assert_eq!(sample_at(1), 16383); // 0.5 * 32767, truncated
    assert_eq!(sample_at(2), -16383); // -0.5 * 32767, truncated toward zero
    assert_eq!(sample_at(3), 32767); // 1.0 clamped
    assert_eq!(sample_at(4), -32767); // -1.0 clamped
}
|
||||||
|
|
||||||
|
/// Integration test: runs actual TTS inference and validates the output audio.
|
||||||
|
/// Skipped unless KOKOROS_MODEL_PATH is set to a directory containing
|
||||||
|
/// kokoro-v1.0.onnx and voices-v1.0.bin.
|
||||||
|
#[tokio::test]
|
||||||
|
async fn tts_produces_valid_speech() {
|
||||||
|
let model_dir = match std::env::var("KOKOROS_MODEL_PATH") {
|
||||||
|
Ok(p) => p,
|
||||||
|
Err(_) => {
|
||||||
|
eprintln!("KOKOROS_MODEL_PATH not set, skipping integration test");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let model_path = format!("{}/kokoro-v1.0.onnx", model_dir);
|
||||||
|
let voices_path = format!("{}/voices-v1.0.bin", model_dir);
|
||||||
|
|
||||||
|
if !std::path::Path::new(&model_path).exists() {
|
||||||
|
eprintln!("Model file not found at {}, skipping", model_path);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
let tts = TTSKoko::new(&model_path, &voices_path).await;
|
||||||
|
|
||||||
|
let input_text = "Hello world, this is a test of speech synthesis.";
|
||||||
|
let out_path = std::env::temp_dir().join("kokoros_integration_test.wav");
|
||||||
|
let out_str = out_path.to_str().unwrap();
|
||||||
|
|
||||||
|
let samples = tts
|
||||||
|
.tts_raw_audio(input_text, "en-us", "af_heart", 1.0, None, None, None, None)
|
||||||
|
.expect("tts_raw_audio failed");
|
||||||
|
|
||||||
|
write_pcm16_wav(out_str, &samples, 24000).unwrap();
|
||||||
|
|
||||||
|
let data = std::fs::read(&out_path).unwrap();
|
||||||
|
std::fs::remove_file(&out_path).unwrap();
|
||||||
|
|
||||||
|
// --- WAV header sanity ---
|
||||||
|
assert_eq!(&data[0..4], b"RIFF");
|
||||||
|
assert_eq!(&data[8..12], b"WAVE");
|
||||||
|
assert_eq!(u16::from_le_bytes(data[20..22].try_into().unwrap()), 1); // PCM
|
||||||
|
assert_eq!(u32::from_le_bytes(data[24..28].try_into().unwrap()), 24000); // sample rate
|
||||||
|
assert_eq!(u16::from_le_bytes(data[34..36].try_into().unwrap()), 16); // 16-bit
|
||||||
|
|
||||||
|
let num_samples = samples.len();
|
||||||
|
let duration_secs = num_samples as f64 / 24000.0;
|
||||||
|
|
||||||
|
// --- Duration check ---
|
||||||
|
// ~10 words should produce roughly 2-8 seconds of speech
|
||||||
|
assert!(
|
||||||
|
duration_secs > 1.0,
|
||||||
|
"Audio too short: {:.2}s for {} words",
|
||||||
|
duration_secs,
|
||||||
|
input_text.split_whitespace().count()
|
||||||
|
);
|
||||||
|
assert!(
|
||||||
|
duration_secs < 15.0,
|
||||||
|
"Audio too long: {:.2}s for {} words",
|
||||||
|
duration_secs,
|
||||||
|
input_text.split_whitespace().count()
|
||||||
|
);
|
||||||
|
|
||||||
|
// --- Energy check: not silence ---
|
||||||
|
let rms = (samples.iter().map(|s| s * s).sum::<f32>() / num_samples as f32).sqrt();
|
||||||
|
assert!(
|
||||||
|
rms > 0.01,
|
||||||
|
"Audio is near-silence: RMS = {:.6}",
|
||||||
|
rms
|
||||||
|
);
|
||||||
|
|
||||||
|
// --- Not clipped/saturated: should have dynamic range ---
|
||||||
|
let max_abs = samples.iter().map(|s| s.abs()).fold(0.0f32, f32::max);
|
||||||
|
assert!(
|
||||||
|
max_abs < 1.0,
|
||||||
|
"Audio is fully saturated (max |sample| = {:.4})",
|
||||||
|
max_abs
|
||||||
|
);
|
||||||
|
assert!(
|
||||||
|
max_abs > 0.05,
|
||||||
|
"Audio has very low amplitude (max |sample| = {:.4})",
|
||||||
|
max_abs
|
||||||
|
);
|
||||||
|
|
||||||
|
// --- Speech-like spectral check ---
|
||||||
|
// Speech should have significant energy variation (not white noise or DC).
|
||||||
|
// Check that the signal has zero-crossings in a speech-like range (roughly
|
||||||
|
// 50-400 crossings per 24000 samples = 100-8000 Hz fundamental range).
|
||||||
|
let zero_crossings: usize = samples
|
||||||
|
.windows(2)
|
||||||
|
.filter(|w| (w[0] >= 0.0) != (w[1] >= 0.0))
|
||||||
|
.count();
|
||||||
|
let crossings_per_sec = zero_crossings as f64 / duration_secs;
|
||||||
|
// White noise at 24kHz would have ~12000 crossings/sec.
|
||||||
|
// Speech is typically 100-4000 crossings/sec.
|
||||||
|
assert!(
|
||||||
|
crossings_per_sec < 10000.0,
|
||||||
|
"Too many zero crossings ({:.0}/s) — likely noise, not speech",
|
||||||
|
crossings_per_sec
|
||||||
|
);
|
||||||
|
assert!(
|
||||||
|
crossings_per_sec > 50.0,
|
||||||
|
"Too few zero crossings ({:.0}/s) — likely DC or silence, not speech",
|
||||||
|
crossings_per_sec
|
||||||
|
);
|
||||||
|
|
||||||
|
eprintln!(
|
||||||
|
"Integration test passed: duration={:.2}s, rms={:.4}, max={:.4}, zero_crossings={:.0}/s",
|
||||||
|
duration_secs, rms, max_abs, crossings_per_sec
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -2514,6 +2514,132 @@
|
||||||
- lang_code:a
|
- lang_code:a
|
||||||
known_usecases:
|
known_usecases:
|
||||||
- tts
|
- tts
|
||||||
|
# Kokoros (Rust) TTS: American English, default voice af_heart.
# NOTE(review): nesting reconstructed from a flattened diff view; confirm
# against the neighbouring gallery entries.
- name: 'kokoros'
  url: 'github:mudler/LocalAI/gallery/kokoros.yaml@master'
  size: '327MB'
  urls:
    - https://github.com/lucasjinreal/Kokoros
  license: apache-2.0
  tags:
    - tts
    - kokoros
    - cpu
    - text-to-speech
    - rust
  description: |
    Kokoros is a pure Rust TTS backend using the Kokoro v1.0 ONNX model (82M parameters).
    Fast, streaming TTS with high quality. American English with af_heart voice.
  overrides:
    backend: 'kokoros'
    name: 'kokoros'
    description: 'Kokoros Rust TTS - American English'
    parameters:
      model: 'kokoro-v1.0.onnx'
      voice: 'af_heart'
    options:
      # Quoted so the colon can never be read as a mapping separator.
      - 'lang_code:en-us'
    known_usecases:
      - tts
  files:
    - filename: 'kokoro-v1.0.onnx'
      sha256: '7d5df8ecf7d4b1878015a32686053fd0eebe2bc377234608764cc0ef3636a6c5'
      uri: 'https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx'
    - filename: 'voices-v1.0.bin'
      sha256: 'bca610b8308e8d99f32e6fe4197e7ec01679264efed0cac9140fe9c29f1fbf7d'
      uri: 'https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin'
|
||||||
|
# Kokoros (Rust) TTS: Japanese, default voice jf_alpha.
# NOTE(review): nesting reconstructed from a flattened diff view; confirm
# against the neighbouring gallery entries.
- name: 'kokoros-ja'
  url: 'github:mudler/LocalAI/gallery/kokoros.yaml@master'
  size: '327MB'
  urls:
    - https://github.com/lucasjinreal/Kokoros
  license: apache-2.0
  tags:
    - tts
    - kokoros
    - japanese
    - text-to-speech
  description: |
    Kokoros Rust TTS - Japanese. Uses the Kokoro v1.0 ONNX model with Japanese phonemization.
  overrides:
    backend: 'kokoros'
    name: 'kokoros-ja'
    description: 'Kokoros Rust TTS - Japanese'
    parameters:
      model: 'kokoro-v1.0.onnx'
      voice: 'jf_alpha'
    options:
      # Quoted so the colon can never be read as a mapping separator.
      - 'lang_code:ja'
    known_usecases:
      - tts
  files:
    - filename: 'kokoro-v1.0.onnx'
      sha256: '7d5df8ecf7d4b1878015a32686053fd0eebe2bc377234608764cc0ef3636a6c5'
      uri: 'https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx'
    - filename: 'voices-v1.0.bin'
      sha256: 'bca610b8308e8d99f32e6fe4197e7ec01679264efed0cac9140fe9c29f1fbf7d'
      uri: 'https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin'
|
||||||
|
# Kokoros (Rust) TTS: Mandarin Chinese, default voice zf_xiaobei.
# NOTE(review): nesting reconstructed from a flattened diff view; confirm
# against the neighbouring gallery entries.
- name: 'kokoros-cmn'
  url: 'github:mudler/LocalAI/gallery/kokoros.yaml@master'
  size: '327MB'
  urls:
    - https://github.com/lucasjinreal/Kokoros
  license: apache-2.0
  tags:
    - tts
    - kokoros
    - chinese
    - text-to-speech
  description: |
    Kokoros Rust TTS - Mandarin Chinese.
  overrides:
    backend: 'kokoros'
    name: 'kokoros-cmn'
    description: 'Kokoros Rust TTS - Mandarin Chinese'
    parameters:
      model: 'kokoro-v1.0.onnx'
      voice: 'zf_xiaobei'
    options:
      # Quoted so the colon can never be read as a mapping separator.
      - 'lang_code:cmn'
    known_usecases:
      - tts
  files:
    - filename: 'kokoro-v1.0.onnx'
      sha256: '7d5df8ecf7d4b1878015a32686053fd0eebe2bc377234608764cc0ef3636a6c5'
      uri: 'https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx'
    - filename: 'voices-v1.0.bin'
      sha256: 'bca610b8308e8d99f32e6fe4197e7ec01679264efed0cac9140fe9c29f1fbf7d'
      uri: 'https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin'
|
||||||
|
# Kokoros (Rust) TTS: German, default voice df_greta.
# NOTE(review): nesting reconstructed from a flattened diff view; confirm
# against the neighbouring gallery entries.
- name: 'kokoros-de'
  url: 'github:mudler/LocalAI/gallery/kokoros.yaml@master'
  size: '327MB'
  urls:
    - https://github.com/lucasjinreal/Kokoros
  license: apache-2.0
  tags:
    - tts
    - kokoros
    - german
    - text-to-speech
  description: |
    Kokoros Rust TTS - German.
  overrides:
    backend: 'kokoros'
    name: 'kokoros-de'
    description: 'Kokoros Rust TTS - German'
    parameters:
      model: 'kokoro-v1.0.onnx'
      voice: 'df_greta'
    options:
      # Quoted so the colon can never be read as a mapping separator.
      - 'lang_code:de'
    known_usecases:
      - tts
  files:
    - filename: 'kokoro-v1.0.onnx'
      sha256: '7d5df8ecf7d4b1878015a32686053fd0eebe2bc377234608764cc0ef3636a6c5'
      uri: 'https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx'
    - filename: 'voices-v1.0.bin'
      sha256: 'bca610b8308e8d99f32e6fe4197e7ec01679264efed0cac9140fe9c29f1fbf7d'
      uri: 'https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin'
|
||||||
- name: "kitten-tts"
|
- name: "kitten-tts"
|
||||||
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
|
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
|
||||||
urls:
|
urls:
|
||||||
|
|
|
||||||
3
gallery/kokoros.yaml
Normal file
3
gallery/kokoros.yaml
Normal file
|
|
@ -0,0 +1,3 @@
|
||||||
|
---
# Gallery template referenced by the `kokoros*` index entries through their
# `url:` field. The embedded config only selects the backend; the remaining
# settings come from each entry's `overrides`.
config_file: |
  backend: kokoros
|
||||||
|
|
@ -21,6 +21,9 @@ function inferBackendPath(item) {
|
||||||
if (item.dockerfile.endsWith("golang")) {
|
if (item.dockerfile.endsWith("golang")) {
|
||||||
return `backend/go/${item.backend}/`;
|
return `backend/go/${item.backend}/`;
|
||||||
}
|
}
|
||||||
|
if (item.dockerfile.endsWith("rust")) {
|
||||||
|
return `backend/rust/${item.backend}/`;
|
||||||
|
}
|
||||||
if (item.dockerfile.endsWith("llama-cpp")) {
|
if (item.dockerfile.endsWith("llama-cpp")) {
|
||||||
return `backend/cpp/llama-cpp/`;
|
return `backend/cpp/llama-cpp/`;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue