LocalAI/backend/python/tinygrad/vendor/audio_helpers.py

# Vendored verbatim from tinygrad examples/audio_helpers.py (MIT license).
# Upstream: https://github.com/tinygrad/tinygrad/blob/master/examples/audio_helpers.py
# Copyright (c) 2023- the tinygrad authors
# SPDX-License-Identifier: MIT
from typing import Optional
from tinygrad import Tensor
from tinygrad.dtype import DTypeLike, dtypes
import math

# rewritten from numpy
def rfftfreq(n: int, d: float = 1.0, device=None) -> Tensor:
  """Return the sample frequencies of the non-negative half of an FFT.

  Tensor counterpart of numpy's ``rfftfreq``: the bin indices
  ``0 .. n // 2`` are scaled by ``1 / (n * d)``.

  Args:
    n: window length.
    d: sample spacing (inverse of the sampling rate).
    device: forwarded unchanged to ``Tensor.arange``.

  Returns:
    A 1-D Tensor holding ``n // 2 + 1`` frequencies.
  """
  num_bins = n // 2 + 1
  bin_width = 1.0 / (n * d)
  return Tensor.arange(num_bins, device=device) * bin_width

# just like in librosa
def fft_frequencies(sr: float, n_fft: int) -> Tensor:
  """Return the center frequency of every real-FFT bin.

  Args:
    sr: sampling rate; its reciprocal is used as the sample spacing.
    n_fft: FFT window length.

  Returns:
    The Tensor produced by ``rfftfreq`` for ``n_fft`` samples spaced ``1 / sr`` apart.
  """
  sample_spacing = 1.0 / sr
  return rfftfreq(n_fft, sample_spacing)

def hz_to_mel(freq: Tensor) -> Tensor:
  """Convert frequencies in Hz to mels.

  The mapping is piecewise: linear (``200 / 3`` Hz per mel) below 1000 Hz and
  logarithmic at and above it.

  Args:
    freq: Tensor of frequencies in Hz.

  Returns:
    A Tensor of the same shape holding the corresponding mel values.
  """
  f_min, f_sp = 0.0, 200.0 / 3
  min_log_hz = 1000.0  # first frequency handled by the log branch (Hz)
  min_log_mel = (min_log_hz - f_min) / f_sp  # the same boundary expressed in mels
  logstep = math.log(6.4) / 27.0  # step size of the log region

  linear_mels = (freq - f_min) / f_sp
  log_mels = min_log_mel + (freq / min_log_hz).log() / logstep
  # both branches are evaluated elementwise; the mask selects which one is kept
  return (freq >= min_log_hz).where(log_mels, linear_mels)

def mel_to_hz(mels: Tensor) -> Tensor:
  """Convert mel values back to frequencies in Hz.

  The mapping is piecewise: linear (``200 / 3`` Hz per mel) below the mel
  value that corresponds to 1000 Hz and exponential at and above it.

  Args:
    mels: Tensor of mel values.

  Returns:
    A Tensor of the same shape holding the corresponding frequencies in Hz.
  """
  f_min, f_sp = 0.0, 200.0 / 3
  min_log_hz = 1000.0  # start of the log region (Hz)
  min_log_mel = (min_log_hz - f_min) / f_sp  # start of the log region (mels)
  logstep = math.log(6.4) / 27.0  # step size of the log region

  linear_hz = f_min + f_sp * mels
  log_hz = min_log_hz * (logstep * (mels - min_log_mel)).exp()
  # both branches are evaluated elementwise; the mask selects which one is kept
  return (mels >= min_log_mel).where(log_hz, linear_hz)

def mel_frequencies(n_mels: int = 128, *, fmin: float = 0.0, fmax: float = 11025.0) -> Tensor:
  """Return ``n_mels`` frequencies (Hz) evenly spaced on the mel scale.

  Args:
    n_mels: number of frequencies to generate.
    fmin: lowest frequency in Hz.
    fmax: highest frequency in Hz.

  Returns:
    A 1-D Tensor of ``n_mels`` frequencies in Hz, obtained by spacing points
    uniformly in mels between ``fmin`` and ``fmax`` and mapping them back to Hz.
  """
  # convert both limits to mels in a single call
  mel_limits = hz_to_mel(Tensor([fmin, fmax]))
  mel_lo, mel_hi = mel_limits[0], mel_limits[1]
  return mel_to_hz(Tensor.linspace(mel_lo, mel_hi, n_mels))

def mel(
  *,
  sr: float,
  n_fft: int,
  n_mels: int = 128,
  fmin: float = 0.0,
  fmax: Optional[float] = None,
  dtype: DTypeLike = dtypes.default_float,
) -> Tensor:
  """Build a mel filterbank that maps FFT bins onto mel bands.

  Each row is a triangular filter that rises from the previous band's center
  to its own center and falls to the next band's center. Rows are then scaled
  by ``2 / band_width`` (Slaney-style, approximately constant energy per channel).

  Args:
    sr: sampling rate of the signal.
    n_fft: FFT window length.
    n_mels: number of mel bands (coerced with ``int``).
    fmin: lowest frequency in Hz.
    fmax: highest frequency in Hz; defaults to ``sr / 2`` when ``None``.
    dtype: dtype of the returned filterbank.

  Returns:
    A Tensor of shape ``(n_mels, n_fft // 2 + 1)`` cast to ``dtype``.
  """
  if fmax is None:
    fmax = float(sr) / 2

  n_mels = int(n_mels)

  fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft)  # center freqs of each FFT bin
  # two extra points supply the outer edges of the first and last triangle
  mel_f = mel_frequencies(n_mels + 2, fmin=fmin, fmax=fmax)

  fdiff = mel_f[1:] - mel_f[:-1]  # spacing between adjacent mel points
  # ramps[i, j] = mel_f[i] - fftfreqs[j]
  ramps = mel_f[None].T.expand(-1, fftfreqs.shape[-1]) - fftfreqs

  # rising and falling slope of every triangle; their minimum clipped at 0 is the filter
  lower = -ramps[:n_mels] / fdiff[:n_mels][None].T
  upper = ramps[2 : n_mels + 2] / fdiff[1 : n_mels + 1][None].T
  weights = lower.minimum(upper).maximum(0)

  # Slaney-style mel is scaled to be approx constant energy per channel
  enorm = 2.0 / (mel_f[2 : n_mels + 2] - mel_f[:n_mels])
  weights = weights * enorm[:, None]

  # honor the requested dtype; previously the parameter was accepted but ignored
  return weights.cast(dtype)
feat(backend): add tinygrad multimodal backend (experimental) (#9364) * feat(backend): add tinygrad multimodal backend Wire tinygrad as a new Python backend covering LLM text generation with native tool-call extraction, embeddings, Stable Diffusion 1.x image generation, and Whisper speech-to-text from a single self-contained container. Backend (`backend/python/tinygrad/`): - `backend.py` gRPC servicer with LLM Predict/PredictStream (auto-detects Llama / Qwen2 / Mistral architecture from `config.json`, supports safetensors and GGUF), Embedding via mean-pooled last hidden state, GenerateImage via the vendored SD1.x pipeline, AudioTranscription + AudioTranscriptionStream via the vendored Whisper inference loop, plus Tokenize / ModelMetadata / Status / Free. - Vendored upstream model code under `vendor/` (MIT, headers preserved): llama.py with an added `qkv_bias` flag for Qwen2-family bias support and an `embed()` method that returns the last hidden state, plus clip.py, unet.py, stable_diffusion.py (trimmed to drop the MLPerf training branch that pulls `mlperf.initializers`), audio_helpers.py and whisper.py (trimmed to drop the pyaudio listener). - Pluggable tool-call parsers under `tool_parsers/`: hermes (Qwen2.5 / Hermes), llama3_json (Llama 3.1+), qwen3_xml (Qwen 3), mistral (Mistral / Mixtral). Auto-selected from model architecture or `Options`. - `install.sh` pins Python 3.11.14 (tinygrad >=0.12 needs >=3.11; the default portable python is 3.10). - `package.sh` bundles libLLVM.so.1 + libedit/libtinfo/libgomp/libsndfile into the scratch image. `run.sh` sets `CPU_LLVM=1` and `LLVM_PATH` so tinygrad's CPU device uses the in-process libLLVM JIT instead of shelling out to the missing `clang` binary. - Local unit tests for Health and the four parsers in `test.py`. Build wiring: - Root `Makefile`: `.NOTPARALLEL`, `prepare-test-extra`, `test-extra`, `BACKEND_TINYGRAD = tinygrad\|python\|.\|false\|true`, docker-build-target eval, and `docker-build-backends` aggregator. 
- `.github/workflows/backend.yml`: cpu / cuda12 / cuda13 build matrix entries (mirrors the transformers backend placement). - `backend/index.yaml`: `&tinygrad` meta + cpu/cuda12/cuda13 image entries (latest + development). E2E test wiring: - `tests/e2e-backends/backend_test.go` gains an `image` capability that exercises GenerateImage and asserts a non-empty PNG is written to `dst`. New `BACKEND_TEST_IMAGE_PROMPT` / `BACKEND_TEST_IMAGE_STEPS` knobs. - Five new make targets next to `test-extra-backend-vllm`: - `test-extra-backend-tinygrad` — Qwen2.5-0.5B-Instruct + hermes, mirrors the vllm target 1:1 (5/9 specs in ~57s). - `test-extra-backend-tinygrad-embeddings` — same model, embeddings via LLM hidden state (3/9 in ~10s). - `test-extra-backend-tinygrad-sd` — stable-diffusion-v1-5 mirror, health/load/image (3/9 in ~10min, 4 diffusion steps on CPU). - `test-extra-backend-tinygrad-whisper` — openai/whisper-tiny.en against jfk.wav from whisper.cpp samples (4/9 in ~49s). - `test-extra-backend-tinygrad-all` aggregate. All four targets land green on the first MVP pass: 15 specs total, 0 failures across LLM+tools, embeddings, image generation, and speech transcription. * refactor(tinygrad): collapse to a single backend image tinygrad generates its own GPU kernels (PTX renderer for CUDA, the autogen ctypes wrappers for HIP / Metal / WebGPU) and never links against cuDNN, cuBLAS, or any toolkit-version-tied library. The only runtime dependency that varies across hosts is the driver's libcuda.so.1 / libamdhip64.so, which are injected into the container at run time by the nvidia-container / rocm runtimes. So unlike torch- or vLLM-based backends, there is no reason to ship per-CUDA-version images. - Drop the cuda12-tinygrad and cuda13-tinygrad build-matrix entries from .github/workflows/backend.yml. The sole remaining entry is renamed to -tinygrad (from -cpu-tinygrad) since it is no longer CPU-only. - Collapse backend/index.yaml to a single meta + development pair. 
The meta anchor carries the latest uri directly; the development entry points at the master tag. - run.sh picks the tinygrad device at launch time by probing /usr/lib/... for libcuda.so.1 / libamdhip64.so. When libcuda is visible we set CUDA=1 + CUDA_PTX=1 so tinygrad uses its own PTX renderer (avoids any nvrtc/toolkit dependency); otherwise we fall back to HIP or CLANG. CPU_LLVM=1 + LLVM_PATH keep the in-process libLLVM JIT for the CLANG path. - backend.py's _select_tinygrad_device() is trimmed to a CLANG-only fallback since production device selection happens in run.sh. Re-ran test-extra-backend-tinygrad after the change: Ran 5 of 9 Specs in 56.541 seconds — 5 Passed, 0 Failed 2026-04-15 17:48:23 +00:00			`# Vendored verbatim from tinygrad examples/audio_helpers.py (MIT license).`
			`# Upstream: https://github.com/tinygrad/tinygrad/blob/master/examples/audio_helpers.py`
			`# Copyright (c) 2023- the tinygrad authors`
			`# SPDX-License-Identifier: MIT`
			`from typing import Optional`
			`from tinygrad import Tensor`
			`from tinygrad.dtype import DTypeLike, dtypes`
			`import math`

			`# rewritten from numpy`
			`def rfftfreq(n: int, d: float = 1.0, device=None) -> Tensor:`
			`val = 1.0 / (n * d)`
			`N = n // 2 + 1`
			`results = Tensor.arange(N, device=device)`
			`return results * val`

			`# just like in librosa`
			`def fft_frequencies(sr: float, n_fft: int) -> Tensor:`
			`return rfftfreq(n=n_fft, d=1.0 / sr)`

			`def hz_to_mel(freq: Tensor) -> Tensor:`
			`# linear part`
			`f_min = 0.0`
			`f_sp = 200.0 / 3`
			`mels = (freq - f_min) / f_sp`

			`# log-scale part`
			`min_log_hz = 1000.0 # beginning of log region (Hz)`
			`mask = freq >= min_log_hz`
			`return mask.where(((min_log_hz - f_min) / f_sp) + (freq / min_log_hz).log() / (math.log(6.4) / 27.0), mels)`

			`def mel_to_hz(mels: Tensor) -> Tensor:`
			`# linear scale`
			`f_min = 0.0`
			`f_sp = 200.0 / 3`
			`freqs = f_min + f_sp * mels`

			`# nonlinear scale`
			`min_log_hz = 1000.0 # beginning of log region (Hz)`
			`min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels)`
			`logstep = math.log(6.4) / 27.0 # step size for log region`

			`log_t = mels >= min_log_mel`
			`freqs = log_t.where(min_log_hz * ((logstep * (mels - min_log_mel)).exp()), freqs)`
			`return freqs`

			`def mel_frequencies(n_mels: int = 128, *, fmin: float = 0.0, fmax: float = 11025.0) -> Tensor:`
			`# center freqs of mel bands - uniformly spaced between limits`
			`min_max_mel = hz_to_mel(Tensor([fmin, fmax]))`

			`mels = Tensor.linspace(min_max_mel[0], min_max_mel[1], n_mels)`
			`hz = mel_to_hz(mels)`
			`return hz`

			`def mel(`
			`*,`
			`sr: float,`
			`n_fft: int,`
			`n_mels: int = 128,`
			`fmin: float = 0.0,`
			`fmax: Optional[float] = None,`
			`dtype: DTypeLike = dtypes.default_float,`
			`) -> Tensor:`
			`if fmax is None:`
			`fmax = float(sr) / 2`

			`n_mels = int(n_mels)`

			`fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft) # center freqs of each FFT bin`
			`mel_f = mel_frequencies(n_mels + 2, fmin=fmin, fmax=fmax) # center freqs of mel bands`

			`fdiff = mel_f[1:] - mel_f[:-1]`
			`ramps = mel_f[None].T.expand(-1, fftfreqs.shape[-1]) - fftfreqs`

			`lower = -ramps[:n_mels] / fdiff[:n_mels][None].T`
			`upper = ramps[2 : n_mels + 2] / fdiff[1 : n_mels + 1][None].T`
			`weights = lower.minimum(upper).maximum(0)`

			`# Slaney-style mel is scaled to be approx constant energy per channel`
			`enorm = 2.0 / (mel_f[2 : n_mels + 2] - mel_f[:n_mels])`
			`weights *= enorm[:, None]`

			`return weights`