OpenClaw managed backend, VAD-aware interrupt, firmware fixes

- Add managed conversation backend for OpenClaw (x-openclaw-message-channel header, user field for device identity)
- Replace aggressive interrupt logic with VAD-aware check: only interrupt on actual speech, not background noise/trailing packets
- Fix 0xDD timeout units (was milliseconds, now seconds) and keep callActive alive with 30s hold during LLM+TTS processing
- Set callActive on boot for VOX mode so device accepts audio without tap
- Mic timeout no longer kills callActive — only double-tap ends the call
- LED feedback: scale to configured led_power, let device handle fade-down
- Add greeting toggle, TTS/SEND logging, pyserial dep, setuptools config
2026-04-21 15:47:55 +00:00 · 2026-04-07 20:16:21 -07:00 · 2026-04-07 20:16:21 -07:00 · 19d48d4e3c
commit 19d48d4e3c
parent a3ac260e1c
6 changed files with 37 additions and 20 deletions
--- a/onjuino/onjuino.ino
+++ b/onjuino/onjuino.ino
@ -296,11 +296,13 @@ void setup()
    udp.write(reinterpret_cast<const uint8_t *>(mcast_string.c_str()), mcast_string.length());
    udp.endPacket();

-    // PTT: auto-start call on boot — bridge will start Sesame session on discovery
+    callActive = true;
    if (PTT_MODE) {
-        callActive = true;
        Serial.println("PTT mode: call auto-started on boot");
        setLed(0, 100, 255, 200, 3); // blue pulse = PTT idle, waiting for bridge
+    } else {
+        Serial.println("VOX mode: call active on boot");
+        mic_timeout = millis() + MIC_LISTEN_MS;
    }

    i2s_driver_install(I2S_NUM, &i2s_config, 0, NULL);
@ -769,7 +771,7 @@ void loop()
        {
            Serial.println("Received mic timeout command (0xDD)");
            uint16_t timeout = header[1] << 8 | header[2];
-            mic_timeout = millis() + timeout;
+            mic_timeout = millis() + (uint32_t)timeout * 1000;
            client.stop();
        }
        else
@ -948,7 +950,7 @@ void micTask(void *pvParameters)
            if (prevState)
            {
                Serial.println("Timeout reached");
-                callActive = false;
+                // Don't set callActive = false — only double-tap should end the call
            }
        }
        else
--- a/pipeline/config.yaml.example
+++ b/pipeline/config.yaml.example
@ -22,7 +22,7 @@ conversation:
    api_key: "${OPENCLAW_GATEWAY_TOKEN}"     # env var reference
    model: "openclaw/default"
    max_tokens: 300
-    session_prefix: "onju-"                  # session key = prefix + device hostname
+    message_channel: "onju-voice"            # x-openclaw-message-channel header
    # provider_model: "anthropic/claude-opus-4-6"  # optional: override backend LLM

 tts:
--- a/pipeline/conversation/managed.py
+++ b/pipeline/conversation/managed.py
@ -21,13 +21,12 @@ class ManagedConversation:
    def __init__(self, cfg: dict, device_id: str):
        self.cfg = cfg
        self.device_id = device_id
-        session_prefix = cfg.get("session_prefix", "onju-")
-        self.session_key = f"{session_prefix}{device_id}"
+        self.message_channel = cfg.get("message_channel", "onju-voice")
        self.client = AsyncOpenAI(
            base_url=cfg["base_url"],
            api_key=_resolve_env(cfg.get("api_key", "none")),
            default_headers={
-                "x-openclaw-session-key": self.session_key,
+                "x-openclaw-message-channel": self.message_channel,
            },
        )

@ -36,6 +35,7 @@ class ManagedConversation:
            model=self.cfg.get("model", "openclaw/default"),
            messages=[{"role": "user", "content": user_text}],
            max_tokens=self.cfg.get("max_tokens", 300),
+            user=self.device_id,
        )

        extra_headers = {}
--- a/pipeline/main.py
+++ b/pipeline/main.py
@ -87,28 +87,35 @@ async def udp_listener(config: dict, manager: DeviceManager, utterance_queue: as
        last_packet_time[device.hostname] = now
        pcm = decode_ulaw(data)

-        # Interrupt current response if device sends new audio while processing
-        if device.processing:
-            device.interrupted.set()
-
        if device.ptt:
            # PTT: just buffer, no VAD needed
+            if device.processing:
+                continue
            device.ptt_buffer.append(pcm)
        else:
            # VOX: run VAD
            utterance = device.vad.process_frame(pcm)

+            # Interrupt only on actual speech (not background noise)
+            if device.processing:
+                if device.vad.speech_prob > config["vad"]["threshold"]:
+                    device.interrupted.set()
+                continue
+
            # LED feedback (only for VOX devices)
+            # Only send a new blink when VAD sees a peak — the device
+            # handles fade-down itself via updateLedTask.
            prob = device.vad.speech_prob
-            if prob > 0.1:
-                device.led_power = min(255, int(prob * 255))
+            new_level = int(prob * dev_cfg["led_power"]) if prob > 0.1 else 0
+            if new_level > device.led_power:
+                device.led_power = min(dev_cfg["led_power"], new_level)
            if now - device.led_update_time > dev_cfg["led_update_period"]:
                device.led_update_time = now
                if device.led_power > 0:
                    asyncio.create_task(
                        send_led_blink(device.ip, tcp_port, device.led_power, fade=dev_cfg["led_fade"])
                    )
-                device.led_power = 0
+                    device.led_power = 0

            if utterance is not None:
                log.info(f"VAD  utterance from {device.hostname} ({len(utterance)/sample_rate:.1f}s)")
@ -120,7 +127,7 @@ async def greet_device(device: Device, config: dict):
    dev_cfg = config["device"]
    tcp_port = config["network"]["tcp_port"]
    greeting_path = dev_cfg.get("greeting_wav")
-    if not greeting_path or not os.path.exists(greeting_path):
+    if not dev_cfg.get("greeting", True) or not greeting_path or not os.path.exists(greeting_path):
        return
    try:
        from pydub import AudioSegment
@ -173,7 +180,8 @@ async def process_utterances(config: dict, manager: DeviceManager, utterance_que
        device.interrupted.clear()

        try:
-            # Tell VOX devices to stop listening while we process
+            # Tell VOX devices to stop listening while we process.
+            # Uses a 30s hold so callActive stays true on the device.
            if not device.ptt:
                await send_stop_listening(device.ip, tcp_port)

@ -217,6 +225,7 @@ async def process_utterances(config: dict, manager: DeviceManager, utterance_que
            # TTS
            try:
                pcm_response = await tts.synthesize(response_text, device.voice, config)
+                log.info(f"TTS  {len(pcm_response)} bytes ({len(pcm_response)/32000:.1f}s)")
            except Exception as e:
                log.error(f"TTS  failed: {e}")
                continue
@ -229,6 +238,7 @@ async def process_utterances(config: dict, manager: DeviceManager, utterance_que
            # Opus encode and send
            frames = opus_encode(pcm_response, config["audio"]["sample_rate"], config["audio"]["opus_frame_size"])
            payload = opus_frames_to_tcp_payload(frames)
+            log.info(f"SEND  {len(frames)} opus frames to {device.ip}")
            await send_audio(device.ip, tcp_port, payload,
                             mic_timeout=dev_cfg["default_mic_timeout"],
                             volume=dev_cfg["default_volume"],
--- a/pipeline/protocol.py
+++ b/pipeline/protocol.py
@ -46,8 +46,9 @@ async def send_led_blink(ip: str, port: int, intensity: int, r: int = 255, g: in
    await send_tcp(ip, port, header, timeout=0.1)


-async def send_stop_listening(ip: str, port: int):
+async def send_stop_listening(ip: str, port: int, hold_s: int = 30):
    # header[0]   0xDD for mic timeout
-    # header[1:2] timeout = 0 (stop)
-    header = bytes([0xDD, 0, 0, 0, 0, 0])
+    # header[1:2] timeout in seconds — nonzero to keep callActive alive
+    #             on the device while server processes LLM + TTS
+    header = bytes([0xDD, (hold_s >> 8) & 0xFF, hold_s & 0xFF, 0, 0, 0])
    await send_tcp(ip, port, header, timeout=0.2)
--- a/pyproject.toml
+++ b/pyproject.toml
@ -12,8 +12,12 @@ dependencies = [
    "PyYAML",
    "scipy",
    "silero-vad",
+    "pyserial",
 ]

+[tool.setuptools.packages.find]
+include = ["pipeline*"]
+
 [project.optional-dependencies]
 tts-local = ["mlx-audio>=0.3.1"]
 mic = ["pyaudio"]