OpenClaw managed backend, VAD-aware interrupt, firmware fixes

- Add managed conversation backend for OpenClaw (x-openclaw-message-channel
  header, user field for device identity)
- Replace aggressive interrupt logic with VAD-aware check: only interrupt
  on actual speech, not background noise/trailing packets
- Fix 0xDD timeout units (was milliseconds, now seconds) and keep callActive
  alive with 30s hold during LLM+TTS processing
- Set callActive on boot for VOX mode so device accepts audio without tap
- Mic timeout no longer kills callActive — only double-tap ends the call
- LED feedback: scale to configured led_power, let device handle fade-down
- Add greeting toggle, TTS/SEND logging, pyserial dep, setuptools config
This commit is contained in:
justLV 2026-04-07 20:16:21 -07:00
parent a3ac260e1c
commit 19d48d4e3c
6 changed files with 37 additions and 20 deletions

View file

@ -296,11 +296,13 @@ void setup()
udp.write(reinterpret_cast<const uint8_t *>(mcast_string.c_str()), mcast_string.length());
udp.endPacket();
// Auto-start call on boot in both PTT and VOX modes — in PTT mode the bridge will start the Sesame session on discovery
callActive = true;
if (PTT_MODE) {
callActive = true;
Serial.println("PTT mode: call auto-started on boot");
setLed(0, 100, 255, 200, 3); // blue pulse = PTT idle, waiting for bridge
} else {
Serial.println("VOX mode: call active on boot");
mic_timeout = millis() + MIC_LISTEN_MS;
}
i2s_driver_install(I2S_NUM, &i2s_config, 0, NULL);
@ -769,7 +771,7 @@ void loop()
{
Serial.println("Received mic timeout command (0xDD)");
uint16_t timeout = header[1] << 8 | header[2];
mic_timeout = millis() + timeout;
mic_timeout = millis() + (uint32_t)timeout * 1000;
client.stop();
}
else
@ -948,7 +950,7 @@ void micTask(void *pvParameters)
if (prevState)
{
Serial.println("Timeout reached");
callActive = false;
// Don't set callActive = false — only double-tap should end the call
}
}
else

View file

@ -22,7 +22,7 @@ conversation:
api_key: "${OPENCLAW_GATEWAY_TOKEN}" # env var reference
model: "openclaw/default"
max_tokens: 300
session_prefix: "onju-" # session key = prefix + device hostname
message_channel: "onju-voice" # x-openclaw-message-channel header
# provider_model: "anthropic/claude-opus-4-6" # optional: override backend LLM
tts:

View file

@ -21,13 +21,12 @@ class ManagedConversation:
def __init__(self, cfg: dict, device_id: str):
self.cfg = cfg
self.device_id = device_id
session_prefix = cfg.get("session_prefix", "onju-")
self.session_key = f"{session_prefix}{device_id}"
self.message_channel = cfg.get("message_channel", "onju-voice")
self.client = AsyncOpenAI(
base_url=cfg["base_url"],
api_key=_resolve_env(cfg.get("api_key", "none")),
default_headers={
"x-openclaw-session-key": self.session_key,
"x-openclaw-message-channel": self.message_channel,
},
)
@ -36,6 +35,7 @@ class ManagedConversation:
model=self.cfg.get("model", "openclaw/default"),
messages=[{"role": "user", "content": user_text}],
max_tokens=self.cfg.get("max_tokens", 300),
user=self.device_id,
)
extra_headers = {}

View file

@ -87,28 +87,35 @@ async def udp_listener(config: dict, manager: DeviceManager, utterance_queue: as
last_packet_time[device.hostname] = now
pcm = decode_ulaw(data)
# Interrupt current response if device sends new audio while processing
if device.processing:
device.interrupted.set()
if device.ptt:
# PTT: just buffer, no VAD needed
if device.processing:
continue
device.ptt_buffer.append(pcm)
else:
# VOX: run VAD
utterance = device.vad.process_frame(pcm)
# Interrupt only on actual speech (not background noise)
if device.processing:
if device.vad.speech_prob > config["vad"]["threshold"]:
device.interrupted.set()
continue
# LED feedback (only for VOX devices)
# Only send a new blink when VAD sees a peak — the device
# handles fade-down itself via updateLedTask.
prob = device.vad.speech_prob
if prob > 0.1:
device.led_power = min(255, int(prob * 255))
new_level = int(prob * dev_cfg["led_power"]) if prob > 0.1 else 0
if new_level > device.led_power:
device.led_power = min(dev_cfg["led_power"], new_level)
if now - device.led_update_time > dev_cfg["led_update_period"]:
device.led_update_time = now
if device.led_power > 0:
asyncio.create_task(
send_led_blink(device.ip, tcp_port, device.led_power, fade=dev_cfg["led_fade"])
)
device.led_power = 0
device.led_power = 0
if utterance is not None:
log.info(f"VAD utterance from {device.hostname} ({len(utterance)/sample_rate:.1f}s)")
@ -120,7 +127,7 @@ async def greet_device(device: Device, config: dict):
dev_cfg = config["device"]
tcp_port = config["network"]["tcp_port"]
greeting_path = dev_cfg.get("greeting_wav")
if not greeting_path or not os.path.exists(greeting_path):
if not dev_cfg.get("greeting", True) or not greeting_path or not os.path.exists(greeting_path):
return
try:
from pydub import AudioSegment
@ -173,7 +180,8 @@ async def process_utterances(config: dict, manager: DeviceManager, utterance_que
device.interrupted.clear()
try:
# Tell VOX devices to stop listening while we process
# Tell VOX devices to stop listening while we process.
# Uses a 30s hold so callActive stays true on the device.
if not device.ptt:
await send_stop_listening(device.ip, tcp_port)
@ -217,6 +225,7 @@ async def process_utterances(config: dict, manager: DeviceManager, utterance_que
# TTS
try:
pcm_response = await tts.synthesize(response_text, device.voice, config)
log.info(f"TTS {len(pcm_response)} bytes ({len(pcm_response)/32000:.1f}s)")
except Exception as e:
log.error(f"TTS failed: {e}")
continue
@ -229,6 +238,7 @@ async def process_utterances(config: dict, manager: DeviceManager, utterance_que
# Opus encode and send
frames = opus_encode(pcm_response, config["audio"]["sample_rate"], config["audio"]["opus_frame_size"])
payload = opus_frames_to_tcp_payload(frames)
log.info(f"SEND {len(frames)} opus frames to {device.ip}")
await send_audio(device.ip, tcp_port, payload,
mic_timeout=dev_cfg["default_mic_timeout"],
volume=dev_cfg["default_volume"],

View file

@ -46,8 +46,9 @@ async def send_led_blink(ip: str, port: int, intensity: int, r: int = 255, g: in
await send_tcp(ip, port, header, timeout=0.1)
async def send_stop_listening(ip: str, port: int):
async def send_stop_listening(ip: str, port: int, hold_s: int = 30):
# header[0] 0xDD for mic timeout
# header[1:2] timeout = 0 (stop)
header = bytes([0xDD, 0, 0, 0, 0, 0])
# header[1], header[2]: timeout in seconds (big-endian uint16) — nonzero
# keeps callActive alive on the device while the server processes LLM + TTS
header = bytes([0xDD, (hold_s >> 8) & 0xFF, hold_s & 0xFF, 0, 0, 0])
await send_tcp(ip, port, header, timeout=0.2)

View file

@ -12,8 +12,12 @@ dependencies = [
"PyYAML",
"scipy",
"silero-vad",
"pyserial",
]
[tool.setuptools.packages.find]
include = ["pipeline*"]
[project.optional-dependencies]
tts-local = ["mlx-audio>=0.3.1"]
mic = ["pyaudio"]