mirror of
https://github.com/justLV/onju-v2
synced 2026-04-21 15:47:55 +00:00
Switch from webrtcvad's binary is_speech to Silero VAD's calibrated float probability via direct ONNX session calls with numpy. The LSTM provides temporal smoothing natively, eliminating the sliding window hack. Frame size changes from 480 (30ms) to 512 (32ms) end-to-end to match Silero's requirements. Consolidate pipeline/requirements.txt into root requirements.txt, swap webrtcvad+setuptools for silero-vad+onnxruntime.
182 lines
6.8 KiB
Python
182 lines
6.8 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
ESP32 Mic UDP Test Receiver
|
|
Receives audio from ESP32, detects compression, saves to WAV
|
|
"""
|
|
|
|
import socket
|
|
import struct
|
|
import wave
|
|
import time
|
|
import argparse
|
|
from collections import deque
|
|
import numpy as np
|
|
|
|
# Audio framing constants; the packet sizes below are what main() uses to
# tell the two wire formats apart.
SAMPLE_RATE = 16000
CHUNK_SIZE = 512
CHUNK_BYTES_COMPRESSED = CHUNK_SIZE  # μ-law: one byte per sample
CHUNK_BYTES_RAW = 2 * CHUNK_SIZE  # int16 PCM: two bytes per sample
|
|
|
|
# μ-law lookup table for decoding (same as ESP32).
# Only the first 128 entries (the negative half) are spelled out; entries
# 128-255 are the same magnitudes with the sign flipped, so they are derived.
_ULAW_NEGATIVE_HALF = (
    -32124, -31100, -30076, -29052, -28028, -27004, -25980, -24956,
    -23932, -22908, -21884, -20860, -19836, -18812, -17788, -16764,
    -15996, -15484, -14972, -14460, -13948, -13436, -12924, -12412,
    -11900, -11388, -10876, -10364, -9852, -9340, -8828, -8316,
    -7932, -7676, -7420, -7164, -6908, -6652, -6396, -6140,
    -5884, -5628, -5372, -5116, -4860, -4604, -4348, -4092,
    -3900, -3772, -3644, -3516, -3388, -3260, -3132, -3004,
    -2876, -2748, -2620, -2492, -2364, -2236, -2108, -1980,
    -1884, -1820, -1756, -1692, -1628, -1564, -1500, -1436,
    -1372, -1308, -1244, -1180, -1116, -1052, -988, -924,
    -876, -844, -812, -780, -748, -716, -684, -652,
    -620, -588, -556, -524, -492, -460, -428, -396,
    -372, -356, -340, -324, -308, -292, -276, -260,
    -244, -228, -212, -196, -180, -164, -148, -132,
    -120, -112, -104, -96, -88, -80, -72, -64,
    -56, -48, -40, -32, -24, -16, -8, 0,
)

ULAW_DECODE_TABLE = np.array(
    _ULAW_NEGATIVE_HALF + tuple(-value for value in _ULAW_NEGATIVE_HALF),
    dtype=np.int16,
)
|
|
|
|
def decode_ulaw(ulaw_bytes):
    """Expand μ-law encoded bytes into int16 PCM samples.

    Each input byte is used as an index into ULAW_DECODE_TABLE, so the
    result has one int16 sample per input byte.
    """
    codes = np.frombuffer(ulaw_bytes, dtype=np.uint8)
    return ULAW_DECODE_TABLE[codes]
|
|
|
|
def calculate_rms(samples):
    """Return the root-mean-square level of *samples*.

    Args:
        samples: NumPy array of audio samples.

    Returns:
        The RMS as a NumPy float32 scalar, or 0.0 when *samples* is empty.
    """
    if samples.size == 0:
        # np.mean() of an empty array is NaN and emits a RuntimeWarning;
        # report silence instead.
        return np.float32(0.0)
    # Convert to float32 before squaring so integer samples cannot overflow.
    return np.sqrt(np.mean(samples.astype(np.float32) ** 2))
|
|
|
|
def main():
    """Receive ESP32 microphone audio over UDP and save it to a WAV file.

    Command-line options select the UDP port, output path and recording
    duration. Each datagram is classified by its size: CHUNK_BYTES_COMPRESSED
    bytes is decoded as μ-law, CHUNK_BYTES_RAW bytes is taken as raw int16
    PCM, and any other size is reported and skipped. Live statistics are
    printed about once per second. Recording ends when the requested duration
    elapses, when no packet arrives for one second after the stream started,
    or on Ctrl-C. A summary is printed after the WAV file is written.
    """
    parser = argparse.ArgumentParser(description='ESP32 mic UDP receiver')
    parser.add_argument('--port', type=int, default=3000, help='UDP port (default: 3000)')
    parser.add_argument('--output', type=str, default='test_recording.wav', help='Output WAV file')
    parser.add_argument('--duration', type=int, default=10, help='Recording duration in seconds (0 = infinite)')
    parser.add_argument('--compressed', action='store_true', help='Expect μ-law compressed audio')
    args = parser.parse_args()

    # Seconds of audio carried by one packet (32 ms for 512 samples @ 16 kHz).
    frame_period = CHUNK_SIZE / SAMPLE_RATE

    audio_frames = []
    packet_count = 0
    total_bytes = 0
    start_time = None
    last_stats_time = None
    first_frame_time = None  # arrival time of the first valid audio frame
    last_frame_time = None   # arrival time of the most recent valid frame
    rms_history = deque(maxlen=20)

    # The context manager guarantees the socket is closed on every exit path.
    with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock:
        sock.bind(('0.0.0.0', args.port))
        sock.settimeout(1.0)

        print(f"Listening on UDP port {args.port}...")
        print(f"Expected format: {'μ-law compressed' if args.compressed else 'raw int16 PCM'}")
        print(f"Output file: {args.output}")
        print("Waiting for packets...\n")

        try:
            while True:
                try:
                    data, addr = sock.recvfrom(2048)
                except socket.timeout:
                    if start_time is not None:
                        # The stream had started and then went quiet: stop.
                        print("\n\nTimeout - no packets received for 1 second")
                        break
                    # Nothing received yet: keep waiting for the first packet.
                    continue

                current_time = time.time()
                if start_time is None:
                    start_time = current_time
                    last_stats_time = start_time
                    print(f"Connected to ESP32: {addr[0]}:{addr[1]}\n")

                packet_count += 1
                total_bytes += len(data)

                # Auto-detect compression based on packet size
                if len(data) == CHUNK_BYTES_COMPRESSED:
                    # Compressed μ-law
                    samples = decode_ulaw(data)
                    is_compressed = True
                elif len(data) == CHUNK_BYTES_RAW:
                    # Raw PCM int16
                    samples = np.frombuffer(data, dtype=np.int16)
                    is_compressed = False
                else:
                    print(f"Warning: Unexpected packet size {len(data)} bytes (expected {CHUNK_BYTES_COMPRESSED} or {CHUNK_BYTES_RAW})")
                    continue

                audio_frames.append(samples)
                rms_history.append(calculate_rms(samples))
                if first_frame_time is None:
                    first_frame_time = current_time
                last_frame_time = current_time

                # Print stats every second
                if current_time - last_stats_time >= 1.0:
                    elapsed = current_time - start_time
                    bandwidth_kbps = (total_bytes * 8) / (elapsed * 1000)
                    avg_rms = np.mean(rms_history)

                    print(f"\r[{elapsed:6.1f}s] Packets: {packet_count:4d} | "
                          f"Bandwidth: {bandwidth_kbps:5.1f} kbps | "
                          f"RMS: {avg_rms:6.0f} | "
                          f"Mode: {'μ-law' if is_compressed else 'raw '}",
                          end='', flush=True)

                    last_stats_time = current_time

                # Check duration
                if args.duration > 0 and (current_time - start_time) >= args.duration:
                    print("\n\nRecording duration reached.")
                    break

        except KeyboardInterrupt:
            print("\n\nInterrupted by user")

    if not audio_frames:
        print("No audio received!")
        return

    # Combine all frames
    print(f"\n\nSaving {len(audio_frames)} chunks to {args.output}...")
    audio_data = np.concatenate(audio_frames)

    # Save to WAV
    with wave.open(args.output, 'wb') as wav_file:
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)
        wav_file.setframerate(SAMPLE_RATE)
        wav_file.writeframes(audio_data.tobytes())

    # Final stats
    duration = len(audio_data) / SAMPLE_RATE
    file_size_kb = len(audio_data) * 2 / 1024
    transmitted_kb = total_bytes / 1024
    compression_ratio = transmitted_kb / file_size_kb if file_size_kb > 0 else 1.0

    # Packet loss must be measured against wall-clock time: the audio duration
    # is derived from the frames that did arrive, so it cannot reveal the ones
    # that did not. The sender is expected to emit one frame per frame_period,
    # starting with the first valid frame (hence the +1).
    expected_frames = round((last_frame_time - first_frame_time) / frame_period) + 1
    # Clamp at zero: network jitter can make arrivals look slightly "too fast".
    packet_loss_pct = max(0.0, 1 - len(audio_frames) / expected_frames) * 100

    print(f"\n{'='*60}")
    print("Recording complete!")
    print(f"{'='*60}")
    print(f"Duration: {duration:.2f} seconds")
    print(f"Samples: {len(audio_data):,}")
    print(f"WAV file size: {file_size_kb:.1f} KB")
    print(f"Bytes transmitted: {transmitted_kb:.1f} KB")
    print(f"Compression ratio: {compression_ratio:.2f}x")
    print(f"Average bandwidth: {(total_bytes * 8) / (duration * 1000):.1f} kbps")
    print(f"Packets received: {packet_count}")
    print(f"Packet loss: {packet_loss_pct:.1f}%")
    print(f"{'='*60}")
|
|
|
|
# Run the receiver only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|