onju-v2/pipeline/config.yaml.example
justLV 13f9d59245 Add Qwen3-TTS as local TTS backend with voice cloning
Adds mlx-audio-based Qwen3-TTS as an alternative to ElevenLabs,
enabling fully offline voice synthesis with voice cloning from a
short reference audio clip. Benchmarked at 0.52x RTF (sub-realtime)
on Apple Silicon with the 1.7B-Base-4bit model.
2026-02-09 13:53:46 -08:00

52 lines
1.6 KiB
Text

---
# Pipeline configuration example. Copy to config.yaml and adjust per deployment.

asr:
  url: "http://localhost:8100"

llm:
  base_url: "http://localhost:8080/v1"  # mlx_lm.server (or Ollama, OpenRouter, OpenAI, etc.)
  api_key: "none"  # set if using a hosted API
  model: "mlx-community/gemma-3-4b-it-qat-4bit"
  max_messages: 20
  max_tokens: 300
  system_prompt: "You are a helpful voice assistant. Keep responses concise (under 2 sentences)."

tts:
  backend: "qwen3"  # "qwen3" (local) or "elevenlabs" (cloud)
  qwen3:
    url: "http://localhost:8880"  # mlx-audio server (start with: python -m mlx_audio.server --port 8880)
    model: "mlx-community/Qwen3-TTS-12Hz-1.7B-Base-4bit"
    ref_audio: ""  # path to 3-10s reference clip for voice cloning
    ref_text: ""  # optional transcript of reference audio
  elevenlabs:
    api_key: ""  # your ElevenLabs API key
    default_voice: "Rachel"
    voices:
      Rachel: "21m00Tcm4TlvDq8ikWAM"  # add your voice IDs here

vad:
  threshold: 0.5  # speech onset probability
  neg_threshold: 0.35  # speech offset probability (hysteresis)
  silence_time: 1.5
  pre_buffer_s: 1.0

network:
  udp_port: 3000
  tcp_port: 3001
  multicast_group: "239.0.0.1"
  multicast_port: 12345

audio:
  sample_rate: 16000
  chunk_size: 512  # 32ms at 16kHz (Silero VAD requirement)
  opus_frame_size: 320  # 20ms at 16kHz

device:
  default_volume: 14
  default_mic_timeout: 60
  led_fade: 6
  led_power: 35
  led_update_period: 0.2
  persist_file: "data/devices.json"
  greeting_wav: "data/hello_imhere.wav"

logging:
  level: "INFO"