onju-v2/test_mic_receiver.py
justLV 0c9c75b3bf Replace webrtcvad with Silero VAD (ONNX, no PyTorch)
Switch from webrtcvad's binary is_speech to Silero VAD's calibrated
float probability via direct ONNX session calls with numpy. The LSTM
provides temporal smoothing natively, eliminating the sliding window
hack. Frame size changes from 480 (30ms) to 512 (32ms) end-to-end
to match Silero's requirements.

Consolidate pipeline/requirements.txt into root requirements.txt,
swap webrtcvad+setuptools for silero-vad+onnxruntime.
2026-02-07 17:00:02 -08:00

182 lines
6.8 KiB
Python

#!/usr/bin/env python3
"""
ESP32 Mic UDP Test Receiver
Receives audio from ESP32, detects compression, saves to WAV
"""
import socket
import struct
import wave
import time
import argparse
from collections import deque
import numpy as np
# Stream parameters shared by the packet size checks and the WAV writer.
# Frame rate written into the WAV header and used to convert sample counts
# into seconds.
SAMPLE_RATE = 16000
# Number of samples carried by one accepted UDP packet.
CHUNK_SIZE = 512
# Payload sizes used to tell the two wire formats apart.
CHUNK_BYTES_RAW = CHUNK_SIZE * 2 # int16 = 2 bytes
CHUNK_BYTES_COMPRESSED = CHUNK_SIZE # μ-law = 1 byte
# μ-law lookup table for decoding (same as ESP32)
# Indexed by the encoded byte value (0-255) and yielding an int16 sample:
# entries 0-127 are the negative half (ending at 0) and entries 128-255
# mirror them with positive sign.
ULAW_DECODE_TABLE = np.array([
-32124,-31100,-30076,-29052,-28028,-27004,-25980,-24956,
-23932,-22908,-21884,-20860,-19836,-18812,-17788,-16764,
-15996,-15484,-14972,-14460,-13948,-13436,-12924,-12412,
-11900,-11388,-10876,-10364,-9852,-9340,-8828,-8316,
-7932,-7676,-7420,-7164,-6908,-6652,-6396,-6140,
-5884,-5628,-5372,-5116,-4860,-4604,-4348,-4092,
-3900,-3772,-3644,-3516,-3388,-3260,-3132,-3004,
-2876,-2748,-2620,-2492,-2364,-2236,-2108,-1980,
-1884,-1820,-1756,-1692,-1628,-1564,-1500,-1436,
-1372,-1308,-1244,-1180,-1116,-1052,-988,-924,
-876,-844,-812,-780,-748,-716,-684,-652,
-620,-588,-556,-524,-492,-460,-428,-396,
-372,-356,-340,-324,-308,-292,-276,-260,
-244,-228,-212,-196,-180,-164,-148,-132,
-120,-112,-104,-96,-88,-80,-72,-64,
-56,-48,-40,-32,-24,-16,-8,0,
32124,31100,30076,29052,28028,27004,25980,24956,
23932,22908,21884,20860,19836,18812,17788,16764,
15996,15484,14972,14460,13948,13436,12924,12412,
11900,11388,10876,10364,9852,9340,8828,8316,
7932,7676,7420,7164,6908,6652,6396,6140,
5884,5628,5372,5116,4860,4604,4348,4092,
3900,3772,3644,3516,3388,3260,3132,3004,
2876,2748,2620,2492,2364,2236,2108,1980,
1884,1820,1756,1692,1628,1564,1500,1436,
1372,1308,1244,1180,1116,1052,988,924,
876,844,812,780,748,716,684,652,
620,588,556,524,492,460,428,396,
372,356,340,324,308,292,276,260,
244,228,212,196,180,164,148,132,
120,112,104,96,88,80,72,64,
56,48,40,32,24,16,8,0
], dtype=np.int16)
def decode_ulaw(ulaw_bytes):
    """Expand a buffer of μ-law bytes into int16 PCM samples.

    Each input byte is used as an index into ``ULAW_DECODE_TABLE``; the
    result is a new int16 array with one sample per input byte.
    """
    # View the payload as unsigned bytes so every value is a valid 0-255 index.
    codes = np.frombuffer(ulaw_bytes, dtype=np.uint8)
    pcm = ULAW_DECODE_TABLE[codes]
    return pcm
def calculate_rms(samples):
    """Return the root-mean-square level of ``samples`` as a float32 scalar.

    Args:
        samples: Array-like of audio samples (e.g. an int16 NumPy array).

    Returns:
        The RMS over all elements, or 0.0 when ``samples`` is empty (taking
        the mean of nothing would otherwise yield NaN and a RuntimeWarning).
    """
    # Convert before squaring: int16 arithmetic would overflow on loud samples.
    values = np.asarray(samples, dtype=np.float32)
    if values.size == 0:
        return np.float32(0.0)
    return np.sqrt(np.mean(values ** 2))
def _decode_packet(data):
    """Turn one UDP payload into int16 samples, inferring the format from its size.

    Args:
        data: Raw datagram payload.

    Returns:
        ``(samples, is_compressed)`` for a recognised payload, or ``None``
        when the size matches neither the μ-law nor the raw PCM chunk size.
    """
    if len(data) == CHUNK_BYTES_COMPRESSED:
        return decode_ulaw(data), True
    if len(data) == CHUNK_BYTES_RAW:
        return np.frombuffer(data, dtype=np.int16), False
    return None


def _estimate_packet_loss(frames_received, first_frame_time, last_frame_time):
    """Estimate the percentage of audio chunks that never arrived.

    The expected chunk count is derived from the wall-clock span between the
    first and last accepted packet and the real chunk duration
    (``CHUNK_SIZE / SAMPLE_RATE``). Deriving it from the recorded audio length
    would always report the same value, because that length is itself built
    from the packets that were received.

    Args:
        frames_received: Number of accepted audio chunks.
        first_frame_time: Arrival time of the first accepted chunk (seconds).
        last_frame_time: Arrival time of the last accepted chunk (seconds).

    Returns:
        Estimated loss in percent, never negative.
    """
    if frames_received <= 0:
        return 0.0
    chunk_seconds = CHUNK_SIZE / SAMPLE_RATE
    # N chunks sent back-to-back span N - 1 chunk intervals, hence the + 1.
    expected = round((last_frame_time - first_frame_time) / chunk_seconds) + 1
    # Network jitter can make the span look short; never report negative loss.
    expected = max(expected, frames_received)
    return (1 - frames_received / expected) * 100


def main():
    """Receive ESP32 microphone audio over UDP and save it as a mono 16-bit WAV.

    Command-line options select the UDP port, the output path and the
    recording duration. The wire format (μ-law or raw int16) is detected per
    packet from the payload size; ``--compressed`` only affects the startup
    banner. Recording stops when the requested duration has elapsed, when no
    packet arrives for one second after the stream has started, or on Ctrl-C.
    Transfer statistics are printed once per second and again at the end.
    """
    parser = argparse.ArgumentParser(description='ESP32 mic UDP receiver')
    parser.add_argument('--port', type=int, default=3000, help='UDP port (default: 3000)')
    parser.add_argument('--output', type=str, default='test_recording.wav', help='Output WAV file')
    parser.add_argument('--duration', type=int, default=10, help='Recording duration in seconds (0 = infinite)')
    parser.add_argument('--compressed', action='store_true', help='Expect μ-law compressed audio')
    args = parser.parse_args()

    audio_frames = []
    packet_count = 0
    total_bytes = 0
    start_time = None
    last_stats_time = None
    first_frame_time = None
    last_frame_time = None
    rms_history = deque(maxlen=20)

    # The context manager guarantees the socket is closed on every exit path.
    with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock:
        sock.bind(('0.0.0.0', args.port))
        # The 1 s timeout doubles as the end-of-stream detector below.
        sock.settimeout(1.0)

        print(f"Listening on UDP port {args.port}...")
        print(f"Expected format: {'μ-law compressed' if args.compressed else 'raw int16 PCM'}")
        print(f"Output file: {args.output}")
        print("Waiting for packets...\n")

        try:
            while True:
                try:
                    data, addr = sock.recvfrom(2048)
                except socket.timeout:
                    if start_time is not None:
                        print("\n\nTimeout - no packets received for 1 second")
                        break
                    # Nothing received yet: keep waiting for the first packet.
                    continue

                now = time.time()
                if start_time is None:
                    start_time = now
                    last_stats_time = now
                    print(f"Connected to ESP32: {addr[0]}:{addr[1]}\n")

                packet_count += 1
                total_bytes += len(data)

                # Auto-detect compression based on packet size
                decoded = _decode_packet(data)
                if decoded is None:
                    print(f"Warning: Unexpected packet size {len(data)} bytes (expected {CHUNK_BYTES_COMPRESSED} or {CHUNK_BYTES_RAW})")
                    continue
                samples, is_compressed = decoded

                if first_frame_time is None:
                    first_frame_time = now
                last_frame_time = now

                audio_frames.append(samples)
                rms_history.append(calculate_rms(samples))

                # Print stats every second
                if now - last_stats_time >= 1.0:
                    elapsed = now - start_time
                    bandwidth_kbps = (total_bytes * 8) / (elapsed * 1000)
                    avg_rms = np.mean(list(rms_history))
                    print(f"\r[{elapsed:6.1f}s] Packets: {packet_count:4d} | "
                          f"Bandwidth: {bandwidth_kbps:5.1f} kbps | "
                          f"RMS: {avg_rms:6.0f} | "
                          f"Mode: {'μ-law' if is_compressed else 'raw '}",
                          end='', flush=True)
                    last_stats_time = now

                # Check duration
                if args.duration > 0 and (now - start_time) >= args.duration:
                    print("\n\nRecording duration reached.")
                    break
        except KeyboardInterrupt:
            print("\n\nInterrupted by user")

    if not audio_frames:
        print("No audio received!")
        return

    # Combine all frames
    print(f"\n\nSaving {len(audio_frames)} chunks to {args.output}...")
    audio_data = np.concatenate(audio_frames)

    # Save to WAV
    with wave.open(args.output, 'wb') as wav_file:
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)
        wav_file.setframerate(SAMPLE_RATE)
        wav_file.writeframes(audio_data.tobytes())

    # Final stats
    duration = len(audio_data) / SAMPLE_RATE
    file_size_kb = len(audio_data) * 2 / 1024
    transmitted_kb = total_bytes / 1024
    compression_ratio = transmitted_kb / file_size_kb if file_size_kb > 0 else 1.0
    packet_loss = _estimate_packet_loss(len(audio_frames), first_frame_time, last_frame_time)

    print(f"\n{'='*60}")
    print("Recording complete!")
    print(f"{'='*60}")
    print(f"Duration: {duration:.2f} seconds")
    print(f"Samples: {len(audio_data):,}")
    print(f"WAV file size: {file_size_kb:.1f} KB")
    print(f"Bytes transmitted: {transmitted_kb:.1f} KB")
    print(f"Compression ratio: {compression_ratio:.2f}x")
    print(f"Average bandwidth: {(total_bytes * 8) / (duration * 1000):.1f} kbps")
    print(f"Packets received: {packet_count}")
    print(f"Packet loss: {packet_loss:.1f}%")
    print(f"{'='*60}")
# Run the receiver only when executed as a script, not when imported.
if "__main__" == __name__:
    main()