onju-v2/record_from_esp32.py
justLV 0c9c75b3bf Replace webrtcvad with Silero VAD (ONNX, no PyTorch)
Switch from webrtcvad's binary is_speech to Silero VAD's calibrated
float probability via direct ONNX session calls with numpy. The LSTM
provides temporal smoothing natively, eliminating the sliding window
hack. Frame size changes from 480 (30ms) to 512 (32ms) end-to-end
to match Silero's requirements.

Consolidate pipeline/requirements.txt into root requirements.txt,
swap webrtcvad+setuptools for silero-vad+onnxruntime.
2026-02-07 17:00:02 -08:00

138 lines
4.3 KiB
Python

#!/usr/bin/env python3
"""
Record audio from ESP32 via UDP
"""
import socket
import wave
import time
import numpy as np
# Address of the ESP32; the TCP greeting that enables the mic is sent here.
ESP32_IP = '192.168.68.90' # Update this if your ESP32 has a different IP
# Local UDP port this script binds to in order to receive audio packets.
UDP_PORT = 3000
# Packet size unit used for format detection: a datagram of CHUNK_SIZE bytes
# is decoded as μ-law (one byte per sample), a datagram of CHUNK_SIZE * 2
# bytes is read as raw int16 samples; any other size is ignored.
CHUNK_SIZE = 512
# μ-law decode table
def _build_ulaw_decode_table():
    """Build the 256-entry μ-law byte -> 16-bit linear PCM lookup table.

    Each μ-law byte is stored bit-inverted. After inversion it holds a sign
    bit (0x80), a 3-bit exponent and a 4-bit mantissa. The linear magnitude
    is ((mantissa << 3) + 0x84) << exponent, minus the 0x84 bias.

    Returns:
        np.ndarray of shape (256,), dtype int16. Indices 0..127 hold the
        negative samples (-32124 .. 0), indices 128..255 the positive
        samples (32124 .. 0).
    """
    inverted = np.arange(256, dtype=np.int32) ^ 0xFF
    exponent = (inverted >> 4) & 0x07
    mantissa = inverted & 0x0F
    magnitude = (((mantissa << 3) + 0x84) << exponent) - 0x84
    # After inversion, a set sign bit marks a negative sample.
    linear = np.where((inverted & 0x80) != 0, -magnitude, magnitude)
    return linear.astype(np.int16)


ULAW_DECODE_TABLE = _build_ulaw_decode_table()
def decode_ulaw(ulaw_bytes):
    """Expand μ-law encoded audio into 16-bit linear PCM samples.

    Args:
        ulaw_bytes: Bytes-like buffer holding one μ-law code per byte.

    Returns:
        np.ndarray of int16 with one sample per input byte, obtained by
        looking each byte up in ULAW_DECODE_TABLE.
    """
    # View the buffer as unsigned bytes so each value is a valid table index.
    codes = np.frombuffer(ulaw_bytes, dtype=np.uint8)
    pcm_samples = ULAW_DECODE_TABLE[codes]
    return pcm_samples
print(f"Step 1: Greeting ESP32 at {ESP32_IP} to enable mic...")

# The greeting is sent over TCP and sets the ESP32's mic timeout, which is
# what enables the microphone stream recorded further down.
GREETING_PORT = 3001
MIC_TIMEOUT_SEC = 60  # Single source of truth for the header and the message

# 6-byte greeting header.
greeting = bytes([
    0xAA,                                  # Audio command
    *MIC_TIMEOUT_SEC.to_bytes(2, 'big'),   # Timeout: high byte, then low byte
    14,                                    # Speaker volume
    5,                                     # LED fade
    0,                                     # Unused
])

try:
    # The context manager closes the socket even when connect/send fails,
    # and sendall() retries until every header byte has been written.
    with socket.create_connection((ESP32_IP, GREETING_PORT), timeout=5.0) as tcp_sock:
        tcp_sock.sendall(greeting)
except OSError as e:
    # Covers refused/unreachable connections and timeouts.
    print(f"✗ Failed to greet ESP32: {e}")
    # SystemExit instead of the site-provided exit() helper, which is not
    # guaranteed to exist (e.g. `python -S`, frozen builds).
    raise SystemExit(1) from None

print(f"✓ Greeted ESP32, mic enabled for {MIC_TIMEOUT_SEC} seconds")

# Short pause before listening — presumably gives the ESP32 time to start
# streaming; TODO confirm against the firmware.
time.sleep(1)
# Step label follows "Step 1" (the greeting); the original skipped to 3.
print(f"\nStep 2: Recording audio from UDP port {UDP_PORT}...")
print("Talk now!\n")

SAMPLE_RATE = 16000  # Hz; used for the WAV header and the duration report

# Record UDP audio
audio_frames = []
packet_count = 0
duration = 10  # Seconds to record before stopping automatically

# The context manager releases the UDP port even if recording fails.
with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as udp_sock:
    udp_sock.bind(('0.0.0.0', UDP_PORT))
    # Short receive timeout so the loop re-checks the deadline (and Ctrl-C)
    # even when no packets arrive.
    udp_sock.settimeout(1.0)

    # Monotonic clock: immune to wall-clock adjustments during the recording.
    start_time = time.monotonic()
    try:
        while (time.monotonic() - start_time) < duration:
            try:
                data, _addr = udp_sock.recvfrom(2048)
            except socket.timeout:
                continue
            packet_count += 1

            # Auto-detect compression from the datagram size.
            if len(data) == CHUNK_SIZE:
                samples = decode_ulaw(data)
                mode = "μ-law"
            elif len(data) == CHUNK_SIZE * 2:
                samples = np.frombuffer(data, dtype=np.int16)
                mode = "raw"
            else:
                # Unexpected size: counted as received but not recorded.
                continue
            audio_frames.append(samples)

            # Progress indicator, refreshed in place every 10th packet.
            if packet_count % 10 == 0:
                elapsed = time.monotonic() - start_time
                rms = np.sqrt(np.mean(samples.astype(np.float32) ** 2))
                print(f"[{elapsed:4.1f}s] Packets: {packet_count:3d} | RMS: {rms:5.0f} | Mode: {mode}", end='\r', flush=True)
    except KeyboardInterrupt:
        # Ctrl-C ends the capture early; whatever was received is still saved.
        print("\n\nStopped by user")

print("\n\nRecording complete!")
print(f"Packets received: {packet_count}")

if audio_frames:
    audio_data = np.concatenate(audio_frames)
    output_file = 'recording.wav'
    # Mono, 16-bit PCM at SAMPLE_RATE.
    with wave.open(output_file, 'wb') as wav_file:
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)
        wav_file.setframerate(SAMPLE_RATE)
        wav_file.writeframes(audio_data.tobytes())
    duration_sec = len(audio_data) / SAMPLE_RATE
    print(f"Saved {duration_sec:.1f}s to {output_file}")
else:
    print("No audio received!")