mirror of
https://github.com/justLV/onju-v2
synced 2026-04-21 15:47:55 +00:00
Switch from webrtcvad's binary is_speech to Silero VAD's calibrated float probability via direct ONNX session calls with numpy. The LSTM provides temporal smoothing natively, eliminating the sliding window hack. Frame size changes from 480 (30ms) to 512 (32ms) end-to-end to match Silero's requirements. Consolidate pipeline/requirements.txt into root requirements.txt, swap webrtcvad+setuptools for silero-vad+onnxruntime.
138 lines
4.3 KiB
Python
138 lines
4.3 KiB
Python
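#!/usr/bin/env python3
"""
Record audio from ESP32 via UDP.

Greets the ESP32 over TCP to enable its microphone, records the μ-law or raw
16-bit PCM packets received over UDP, and saves the result to recording.wav.
"""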
import socket
import sys
import time
import wave

import numpy as np

# Connection settings for the ESP32 audio stream.
ESP32_IP = '192.168.68.90'  # Update this if your ESP32 has a different IP
UDP_PORT = 3000  # Local UDP port this script binds to for incoming audio
# Packet size used to auto-detect the encoding: a packet of CHUNK_SIZE bytes is
# treated as μ-law, a packet of CHUNK_SIZE * 2 bytes as raw 16-bit PCM.
CHUNK_SIZE = 512

# μ-law decode table
# Maps each of the 256 possible μ-law code bytes to its 16-bit linear PCM
# sample, so decoding is a single array lookup. Codes 0-127 decode to the
# negative half of the range, codes 128-255 to the positive half.
ULAW_DECODE_TABLE = np.array([
    -32124, -31100, -30076, -29052, -28028, -27004, -25980, -24956,
    -23932, -22908, -21884, -20860, -19836, -18812, -17788, -16764,
    -15996, -15484, -14972, -14460, -13948, -13436, -12924, -12412,
    -11900, -11388, -10876, -10364, -9852, -9340, -8828, -8316,
    -7932, -7676, -7420, -7164, -6908, -6652, -6396, -6140,
    -5884, -5628, -5372, -5116, -4860, -4604, -4348, -4092,
    -3900, -3772, -3644, -3516, -3388, -3260, -3132, -3004,
    -2876, -2748, -2620, -2492, -2364, -2236, -2108, -1980,
    -1884, -1820, -1756, -1692, -1628, -1564, -1500, -1436,
    -1372, -1308, -1244, -1180, -1116, -1052, -988, -924,
    -876, -844, -812, -780, -748, -716, -684, -652,
    -620, -588, -556, -524, -492, -460, -428, -396,
    -372, -356, -340, -324, -308, -292, -276, -260,
    -244, -228, -212, -196, -180, -164, -148, -132,
    -120, -112, -104, -96, -88, -80, -72, -64,
    -56, -48, -40, -32, -24, -16, -8, 0,
    32124, 31100, 30076, 29052, 28028, 27004, 25980, 24956,
    23932, 22908, 21884, 20860, 19836, 18812, 17788, 16764,
    15996, 15484, 14972, 14460, 13948, 13436, 12924, 12412,
    11900, 11388, 10876, 10364, 9852, 9340, 8828, 8316,
    7932, 7676, 7420, 7164, 6908, 6652, 6396, 6140,
    5884, 5628, 5372, 5116, 4860, 4604, 4348, 4092,
    3900, 3772, 3644, 3516, 3388, 3260, 3132, 3004,
    2876, 2748, 2620, 2492, 2364, 2236, 2108, 1980,
    1884, 1820, 1756, 1692, 1628, 1564, 1500, 1436,
    1372, 1308, 1244, 1180, 1116, 1052, 988, 924,
    876, 844, 812, 780, 748, 716, 684, 652,
    620, 588, 556, 524, 492, 460, 428, 396,
    372, 356, 340, 324, 308, 292, 276, 260,
    244, 228, 212, 196, 180, 164, 148, 132,
    120, 112, 104, 96, 88, 80, 72, 64,
    56, 48, 40, 32, 24, 16, 8, 0,
], dtype=np.int16)

def decode_ulaw(ulaw_bytes):
    """Decode μ-law encoded audio into 16-bit PCM samples.

    Args:
        ulaw_bytes: Bytes-like object holding one μ-law code per byte.

    Returns:
        A NumPy ``int16`` array with one decoded sample per input byte.
    """
    # Each byte is used directly as an index into the 256-entry lookup table.
    codes = np.frombuffer(ulaw_bytes, dtype=np.uint8)
    return ULAW_DECODE_TABLE[codes]

# ---------------------------------------------------------------------------
# Script body: greet the ESP32 over TCP so it enables its microphone, record
# the audio packets it sends over UDP for a fixed duration, then save them as
# a 16 kHz mono 16-bit WAV file.
# ---------------------------------------------------------------------------

print(f"Step 1: Greeting ESP32 at {ESP32_IP} to enable mic...")

# Send greeting via TCP to set mic_timeout
mic_timeout_s = 60

# Send greeting: 0xAA header + 60 second timeout
header = bytearray(6)
header[0] = 0xAA  # Audio command
header[1:3] = mic_timeout_s.to_bytes(2, "big")  # Timeout high byte, low byte
header[3] = 14    # Speaker volume
header[4] = 5     # LED fade
header[5] = 0     # Unused

try:
    # The context manager closes the socket even when connect/sendall fails.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as tcp_sock:
        tcp_sock.settimeout(5.0)
        tcp_sock.connect((ESP32_IP, 3001))
        # sendall retries until the whole header is written; send() may not.
        tcp_sock.sendall(header)
except OSError as e:
    print(f"✗ Failed to greet ESP32: {e}")
    sys.exit(1)
else:
    print(f"✓ Greeted ESP32, mic enabled for {mic_timeout_s} seconds")

time.sleep(1)

print(f"\nStep 3: Recording audio from UDP port {UDP_PORT}...")
print("Talk now!\n")

# Record UDP audio
sample_rate = 16000  # Hz, written to the WAV header
duration = 10        # seconds to record
audio_frames = []
packet_count = 0

with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as udp_sock:
    udp_sock.bind(('0.0.0.0', UDP_PORT))
    # Short timeout so the loop re-checks the deadline when no packets arrive.
    udp_sock.settimeout(1.0)

    # Monotonic clock: the recording length is unaffected by wall-clock jumps.
    start_time = time.monotonic()

    try:
        while (time.monotonic() - start_time) < duration:
            try:
                data, _addr = udp_sock.recvfrom(2048)
            except socket.timeout:
                continue

            packet_count += 1

            # Auto-detect compression
            if len(data) == CHUNK_SIZE:
                samples = decode_ulaw(data)
                mode = "μ-law"
            elif len(data) == CHUNK_SIZE * 2:
                # Explicit little-endian so decoding does not depend on the
                # host byte order.
                samples = np.frombuffer(data, dtype='<i2')
                mode = "raw"
            else:
                # Unrecognised packet size: counted, but not recorded.
                continue

            audio_frames.append(samples)

            # Progress indicator
            if packet_count % 10 == 0:
                elapsed = time.monotonic() - start_time
                rms = np.sqrt(np.mean(samples.astype(np.float32) ** 2))
                print(f"[{elapsed:4.1f}s] Packets: {packet_count:3d} | RMS: {rms:5.0f} | Mode: {mode}", end='\r', flush=True)

    except KeyboardInterrupt:
        # Ctrl+C stops recording early; whatever was captured is still saved.
        print("\n\nStopped by user")

print("\n\nRecording complete!")
print(f"Packets received: {packet_count}")

if audio_frames:
    audio_data = np.concatenate(audio_frames)
    output_file = 'recording.wav'

    with wave.open(output_file, 'wb') as wav_file:
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)
        wav_file.setframerate(sample_rate)
        # WAV PCM data is little-endian; convert explicitly (no copy on
        # little-endian hosts).
        wav_file.writeframes(audio_data.astype('<i2', copy=False).tobytes())

    duration_sec = len(audio_data) / sample_rate
    print(f"Saved {duration_sec:.1f}s to {output_file}")
else:
    print("No audio received!")