mirror of
https://github.com/justLV/onju-v2
synced 2026-04-21 15:47:55 +00:00
OpenClaw managed backend, VAD-aware interrupt, firmware fixes
- Add managed conversation backend for OpenClaw (x-openclaw-message-channel header, user field for device identity)
- Replace aggressive interrupt logic with VAD-aware check: only interrupt on actual speech, not background noise/trailing packets
- Fix 0xDD timeout units (was milliseconds, now seconds) and keep callActive alive with 30s hold during LLM+TTS processing
- Set callActive on boot for VOX mode so device accepts audio without tap
- Mic timeout no longer kills callActive — only double-tap ends the call
- LED feedback: scale to configured led_power, let device handle fade-down
- Add greeting toggle, TTS/SEND logging, pyserial dep, setuptools config
This commit is contained in:
parent
a3ac260e1c
commit
19d48d4e3c
6 changed files with 37 additions and 20 deletions
|
|
@ -296,11 +296,13 @@ void setup()
|
|||
udp.write(reinterpret_cast<const uint8_t *>(mcast_string.c_str()), mcast_string.length());
|
||||
udp.endPacket();
|
||||
|
||||
// PTT: auto-start call on boot — bridge will start Sesame session on discovery
|
||||
callActive = true;
|
||||
if (PTT_MODE) {
|
||||
callActive = true;
|
||||
Serial.println("PTT mode: call auto-started on boot");
|
||||
setLed(0, 100, 255, 200, 3); // blue pulse = PTT idle, waiting for bridge
|
||||
} else {
|
||||
Serial.println("VOX mode: call active on boot");
|
||||
mic_timeout = millis() + MIC_LISTEN_MS;
|
||||
}
|
||||
|
||||
i2s_driver_install(I2S_NUM, &i2s_config, 0, NULL);
|
||||
|
|
@ -769,7 +771,7 @@ void loop()
|
|||
{
|
||||
Serial.println("Received mic timeout command (0xDD)");
|
||||
uint16_t timeout = header[1] << 8 | header[2];
|
||||
mic_timeout = millis() + timeout;
|
||||
mic_timeout = millis() + (uint32_t)timeout * 1000;
|
||||
client.stop();
|
||||
}
|
||||
else
|
||||
|
|
@ -948,7 +950,7 @@ void micTask(void *pvParameters)
|
|||
if (prevState)
|
||||
{
|
||||
Serial.println("Timeout reached");
|
||||
callActive = false;
|
||||
// Don't set callActive = false — only double-tap should end the call
|
||||
}
|
||||
}
|
||||
else
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@ conversation:
|
|||
api_key: "${OPENCLAW_GATEWAY_TOKEN}" # env var reference
|
||||
model: "openclaw/default"
|
||||
max_tokens: 300
|
||||
session_prefix: "onju-" # session key = prefix + device hostname
|
||||
message_channel: "onju-voice" # x-openclaw-message-channel header
|
||||
# provider_model: "anthropic/claude-opus-4-6" # optional: override backend LLM
|
||||
|
||||
tts:
|
||||
|
|
|
|||
|
|
@ -21,13 +21,12 @@ class ManagedConversation:
|
|||
def __init__(self, cfg: dict, device_id: str):
|
||||
self.cfg = cfg
|
||||
self.device_id = device_id
|
||||
session_prefix = cfg.get("session_prefix", "onju-")
|
||||
self.session_key = f"{session_prefix}{device_id}"
|
||||
self.message_channel = cfg.get("message_channel", "onju-voice")
|
||||
self.client = AsyncOpenAI(
|
||||
base_url=cfg["base_url"],
|
||||
api_key=_resolve_env(cfg.get("api_key", "none")),
|
||||
default_headers={
|
||||
"x-openclaw-session-key": self.session_key,
|
||||
"x-openclaw-message-channel": self.message_channel,
|
||||
},
|
||||
)
|
||||
|
||||
|
|
@ -36,6 +35,7 @@ class ManagedConversation:
|
|||
model=self.cfg.get("model", "openclaw/default"),
|
||||
messages=[{"role": "user", "content": user_text}],
|
||||
max_tokens=self.cfg.get("max_tokens", 300),
|
||||
user=self.device_id,
|
||||
)
|
||||
|
||||
extra_headers = {}
|
||||
|
|
|
|||
|
|
@ -87,28 +87,35 @@ async def udp_listener(config: dict, manager: DeviceManager, utterance_queue: as
|
|||
last_packet_time[device.hostname] = now
|
||||
pcm = decode_ulaw(data)
|
||||
|
||||
# Interrupt current response if device sends new audio while processing
|
||||
if device.processing:
|
||||
device.interrupted.set()
|
||||
|
||||
if device.ptt:
|
||||
# PTT: just buffer, no VAD needed
|
||||
if device.processing:
|
||||
continue
|
||||
device.ptt_buffer.append(pcm)
|
||||
else:
|
||||
# VOX: run VAD
|
||||
utterance = device.vad.process_frame(pcm)
|
||||
|
||||
# Interrupt only on actual speech (not background noise)
|
||||
if device.processing:
|
||||
if device.vad.speech_prob > config["vad"]["threshold"]:
|
||||
device.interrupted.set()
|
||||
continue
|
||||
|
||||
# LED feedback (only for VOX devices)
|
||||
# Only send a new blink when VAD sees a peak — the device
|
||||
# handles fade-down itself via updateLedTask.
|
||||
prob = device.vad.speech_prob
|
||||
if prob > 0.1:
|
||||
device.led_power = min(255, int(prob * 255))
|
||||
new_level = int(prob * dev_cfg["led_power"]) if prob > 0.1 else 0
|
||||
if new_level > device.led_power:
|
||||
device.led_power = min(dev_cfg["led_power"], new_level)
|
||||
if now - device.led_update_time > dev_cfg["led_update_period"]:
|
||||
device.led_update_time = now
|
||||
if device.led_power > 0:
|
||||
asyncio.create_task(
|
||||
send_led_blink(device.ip, tcp_port, device.led_power, fade=dev_cfg["led_fade"])
|
||||
)
|
||||
device.led_power = 0
|
||||
device.led_power = 0
|
||||
|
||||
if utterance is not None:
|
||||
log.info(f"VAD utterance from {device.hostname} ({len(utterance)/sample_rate:.1f}s)")
|
||||
|
|
@ -120,7 +127,7 @@ async def greet_device(device: Device, config: dict):
|
|||
dev_cfg = config["device"]
|
||||
tcp_port = config["network"]["tcp_port"]
|
||||
greeting_path = dev_cfg.get("greeting_wav")
|
||||
if not greeting_path or not os.path.exists(greeting_path):
|
||||
if not dev_cfg.get("greeting", True) or not greeting_path or not os.path.exists(greeting_path):
|
||||
return
|
||||
try:
|
||||
from pydub import AudioSegment
|
||||
|
|
@ -173,7 +180,8 @@ async def process_utterances(config: dict, manager: DeviceManager, utterance_que
|
|||
device.interrupted.clear()
|
||||
|
||||
try:
|
||||
# Tell VOX devices to stop listening while we process
|
||||
# Tell VOX devices to stop listening while we process.
|
||||
# Uses a 30s hold so callActive stays true on the device.
|
||||
if not device.ptt:
|
||||
await send_stop_listening(device.ip, tcp_port)
|
||||
|
||||
|
|
@ -217,6 +225,7 @@ async def process_utterances(config: dict, manager: DeviceManager, utterance_que
|
|||
# TTS
|
||||
try:
|
||||
pcm_response = await tts.synthesize(response_text, device.voice, config)
|
||||
log.info(f"TTS {len(pcm_response)} bytes ({len(pcm_response)/32000:.1f}s)")
|
||||
except Exception as e:
|
||||
log.error(f"TTS failed: {e}")
|
||||
continue
|
||||
|
|
@ -229,6 +238,7 @@ async def process_utterances(config: dict, manager: DeviceManager, utterance_que
|
|||
# Opus encode and send
|
||||
frames = opus_encode(pcm_response, config["audio"]["sample_rate"], config["audio"]["opus_frame_size"])
|
||||
payload = opus_frames_to_tcp_payload(frames)
|
||||
log.info(f"SEND {len(frames)} opus frames to {device.ip}")
|
||||
await send_audio(device.ip, tcp_port, payload,
|
||||
mic_timeout=dev_cfg["default_mic_timeout"],
|
||||
volume=dev_cfg["default_volume"],
|
||||
|
|
|
|||
|
|
@ -46,8 +46,9 @@ async def send_led_blink(ip: str, port: int, intensity: int, r: int = 255, g: in
|
|||
await send_tcp(ip, port, header, timeout=0.1)
|
||||
|
||||
|
||||
async def send_stop_listening(ip: str, port: int):
|
||||
async def send_stop_listening(ip: str, port: int, hold_s: int = 30):
|
||||
# header[0] 0xDD for mic timeout
|
||||
# header[1:2] timeout = 0 (stop)
|
||||
header = bytes([0xDD, 0, 0, 0, 0, 0])
|
||||
# header[1:2] timeout in seconds — nonzero to keep callActive alive
|
||||
# on the device while server processes LLM + TTS
|
||||
header = bytes([0xDD, (hold_s >> 8) & 0xFF, hold_s & 0xFF, 0, 0, 0])
|
||||
await send_tcp(ip, port, header, timeout=0.2)
|
||||
|
|
|
|||
|
|
@ -12,8 +12,12 @@ dependencies = [
|
|||
"PyYAML",
|
||||
"scipy",
|
||||
"silero-vad",
|
||||
"pyserial",
|
||||
]
|
||||
|
||||
[tool.setuptools.packages.find]
|
||||
include = ["pipeline*"]
|
||||
|
||||
[project.optional-dependencies]
|
||||
tts-local = ["mlx-audio>=0.3.1"]
|
||||
mic = ["pyaudio"]
|
||||
|
|
|
|||
Loading…
Reference in a new issue