LocalAI/backend/python/tinygrad/tool_parsers/llama3_json.py

"""Llama 3.1 / 3.2 / 3.3 JSON tool-call parser.

Meta's Llama 3.1+ instruct chat templates emit tool calls in two broadly
compatible shapes:

  1. With the `<|python_tag|>` lead-in:
        <|python_tag|>{"name": "get_weather", "parameters": {"city": "Paris"}}
  2. As a bare JSON object (or list of objects) at the end of the turn.

We also handle multi-call shapes where the model emits several JSON objects
separated by `;` or newlines, and JSON arrays `[{...}, {...}]`. The key field
for Llama 3 is historically `parameters` (older docs) but recent checkpoints
also emit `arguments` — accept either.
"""
from __future__ import annotations

import json
import re
from dataclasses import dataclass

from .base import ToolCall, ToolParser, register

_PYTHON_TAG = "<|python_tag|>"
_JSON_OBJECT_RE = re.compile(r"\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}", re.DOTALL)


def _coerce_call(obj: object, index: int) -> ToolCall | None:
    if not isinstance(obj, dict):
        return None
    name = obj.get("name")
    if not isinstance(name, str):
        return None
    args = obj.get("arguments", obj.get("parameters", {}))
    args_str = args if isinstance(args, str) else json.dumps(args, ensure_ascii=False)
    return ToolCall(index=index, name=name, arguments=args_str)


@register
class Llama3JsonToolParser(ToolParser):
    """Tool-call parser for Llama 3.1+ JSON output (parser name ``llama3_json``).

    Tool calls are recognised as JSON objects, or JSON arrays of objects, that
    ``_try_parse`` accepts. They may appear after a ``<|python_tag|>`` marker
    or as bare JSON embedded in the turn.
    """

    name = "llama3_json"

    def parse(self, text: str) -> tuple[str, list[ToolCall]]:
        """Split a model turn into plain content and extracted tool calls.

        Args:
            text: The full text generated by the model for one turn.

        Returns:
            A ``(content, calls)`` tuple. ``content`` is the text preceding the
            first ``<|python_tag|>`` (or the whole text when there is no tag)
            with every recognised tool-call JSON removed and surrounding
            whitespace stripped. ``calls`` lists the tool calls, those found
            after python tags first, then those found in the leading text.
        """
        calls: list[ToolCall] = []

        # str.split always yields at least one piece, so `head` is the whole
        # text when no <|python_tag|> is present and `tails` is then empty.
        head, *tails = text.split(_PYTHON_TAG)

        # Everything after a python tag is tool-call body: harvest the calls
        # and deliberately discard whatever text is left over (separators,
        # end-of-message tokens, ...) so it never leaks into the content.
        for tail in tails:
            self._strip_json_calls(tail, calls)

        found_before_head = len(calls)
        content = self._strip_json_calls(head, calls).strip()

        # When removing calls leaves nothing but the separators that sat
        # between them (e.g. `{...}; {...}`), there is no real content.
        if len(calls) > found_before_head and not content.strip(";, \t\r\n"):
            content = ""
        return content, calls

    @staticmethod
    def _strip_json_calls(text: str, calls: list[ToolCall]) -> str:
        """Append tool calls found in ``text`` to ``calls``; return the rest.

        Each ``{`` or ``[`` is tried as the start of a JSON value using
        ``json.JSONDecoder.raw_decode``. Unlike a brace-matching regex this
        copes with arbitrarily nested arguments, braces inside JSON strings
        and whole arrays of calls. JSON values that do not describe a tool
        call are left in the returned text untouched.
        """
        decoder = json.JSONDecoder()
        kept: list[str] = []
        copied_to = 0  # start of the text not yet copied into `kept`
        scan_from = 0  # openers before this offset lie inside a decoded value

        for opener in re.finditer(r"[{\[]", text):
            start = opener.start()
            if start < scan_from:
                continue
            try:
                _, end = decoder.raw_decode(text, start)
            except (json.JSONDecodeError, RecursionError):
                # Not valid JSON from here (or absurdly deep nesting): treat
                # the bracket as ordinary text and keep scanning.
                continue
            # Never look inside a value that decoded as a whole, so nested
            # objects of a non-call value are not mistaken for calls.
            scan_from = end

            found = _try_parse(text[start:end], len(calls))
            if not found:
                continue
            calls.extend(found)
            # Cut the call out by offset so exactly the matched span is removed.
            kept.append(text[copied_to:start])
            copied_to = end

        kept.append(text[copied_to:])
        return "".join(kept)


def _try_parse(blob: str, start_index: int) -> list[ToolCall]:
    """Parse a fragment that may be a JSON object or a JSON array of objects."""
    blob = blob.strip().rstrip(";")
    if not blob:
        return []
    try:
        obj = json.loads(blob)
    except json.JSONDecodeError:
        return []
    if isinstance(obj, dict):
        call = _coerce_call(obj, start_index)
        return [call] if call else []
    if isinstance(obj, list):
        calls: list[ToolCall] = []
        for i, item in enumerate(obj):
            c = _coerce_call(item, start_index + i)
            if c:
                calls.append(c)
        return calls
    return []
feat(backend): add tinygrad multimodal backend (experimental) (#9364) * feat(backend): add tinygrad multimodal backend Wire tinygrad as a new Python backend covering LLM text generation with native tool-call extraction, embeddings, Stable Diffusion 1.x image generation, and Whisper speech-to-text from a single self-contained container. Backend (`backend/python/tinygrad/`): - `backend.py` gRPC servicer with LLM Predict/PredictStream (auto-detects Llama / Qwen2 / Mistral architecture from `config.json`, supports safetensors and GGUF), Embedding via mean-pooled last hidden state, GenerateImage via the vendored SD1.x pipeline, AudioTranscription + AudioTranscriptionStream via the vendored Whisper inference loop, plus Tokenize / ModelMetadata / Status / Free. - Vendored upstream model code under `vendor/` (MIT, headers preserved): llama.py with an added `qkv_bias` flag for Qwen2-family bias support and an `embed()` method that returns the last hidden state, plus clip.py, unet.py, stable_diffusion.py (trimmed to drop the MLPerf training branch that pulls `mlperf.initializers`), audio_helpers.py and whisper.py (trimmed to drop the pyaudio listener). - Pluggable tool-call parsers under `tool_parsers/`: hermes (Qwen2.5 / Hermes), llama3_json (Llama 3.1+), qwen3_xml (Qwen 3), mistral (Mistral / Mixtral). Auto-selected from model architecture or `Options`. - `install.sh` pins Python 3.11.14 (tinygrad >=0.12 needs >=3.11; the default portable python is 3.10). - `package.sh` bundles libLLVM.so.1 + libedit/libtinfo/libgomp/libsndfile into the scratch image. `run.sh` sets `CPU_LLVM=1` and `LLVM_PATH` so tinygrad's CPU device uses the in-process libLLVM JIT instead of shelling out to the missing `clang` binary. - Local unit tests for Health and the four parsers in `test.py`. Build wiring: - Root `Makefile`: `.NOTPARALLEL`, `prepare-test-extra`, `test-extra`, `BACKEND_TINYGRAD = tinygrad\|python\|.\|false\|true`, docker-build-target eval, and `docker-build-backends` aggregator. 
- `.github/workflows/backend.yml`: cpu / cuda12 / cuda13 build matrix entries (mirrors the transformers backend placement). - `backend/index.yaml`: `&tinygrad` meta + cpu/cuda12/cuda13 image entries (latest + development). E2E test wiring: - `tests/e2e-backends/backend_test.go` gains an `image` capability that exercises GenerateImage and asserts a non-empty PNG is written to `dst`. New `BACKEND_TEST_IMAGE_PROMPT` / `BACKEND_TEST_IMAGE_STEPS` knobs. - Five new make targets next to `test-extra-backend-vllm`: - `test-extra-backend-tinygrad` — Qwen2.5-0.5B-Instruct + hermes, mirrors the vllm target 1:1 (5/9 specs in ~57s). - `test-extra-backend-tinygrad-embeddings` — same model, embeddings via LLM hidden state (3/9 in ~10s). - `test-extra-backend-tinygrad-sd` — stable-diffusion-v1-5 mirror, health/load/image (3/9 in ~10min, 4 diffusion steps on CPU). - `test-extra-backend-tinygrad-whisper` — openai/whisper-tiny.en against jfk.wav from whisper.cpp samples (4/9 in ~49s). - `test-extra-backend-tinygrad-all` aggregate. All four targets land green on the first MVP pass: 15 specs total, 0 failures across LLM+tools, embeddings, image generation, and speech transcription. * refactor(tinygrad): collapse to a single backend image tinygrad generates its own GPU kernels (PTX renderer for CUDA, the autogen ctypes wrappers for HIP / Metal / WebGPU) and never links against cuDNN, cuBLAS, or any toolkit-version-tied library. The only runtime dependency that varies across hosts is the driver's libcuda.so.1 / libamdhip64.so, which are injected into the container at run time by the nvidia-container / rocm runtimes. So unlike torch- or vLLM-based backends, there is no reason to ship per-CUDA-version images. - Drop the cuda12-tinygrad and cuda13-tinygrad build-matrix entries from .github/workflows/backend.yml. The sole remaining entry is renamed to -tinygrad (from -cpu-tinygrad) since it is no longer CPU-only. - Collapse backend/index.yaml to a single meta + development pair. 
The meta anchor carries the latest uri directly; the development entry points at the master tag. - run.sh picks the tinygrad device at launch time by probing /usr/lib/... for libcuda.so.1 / libamdhip64.so. When libcuda is visible we set CUDA=1 + CUDA_PTX=1 so tinygrad uses its own PTX renderer (avoids any nvrtc/toolkit dependency); otherwise we fall back to HIP or CLANG. CPU_LLVM=1 + LLVM_PATH keep the in-process libLLVM JIT for the CLANG path. - backend.py's _select_tinygrad_device() is trimmed to a CLANG-only fallback since production device selection happens in run.sh. Re-ran test-extra-backend-tinygrad after the change: Ran 5 of 9 Specs in 56.541 seconds — 5 Passed, 0 Failed 2026-04-15 17:48:23 +00:00			`"""Llama 3.1 / 3.2 / 3.3 JSON tool-call parser.`

			`Meta's Llama 3.1+ instruct chat templates emit tool calls in two broadly`
			`compatible shapes:`

			1. With the `<\|python_tag\|>` lead-in:
			`<\|python_tag\|>{"name": "get_weather", "parameters": {"city": "Paris"}}`
			`2. As a bare JSON object (or list of objects) at the end of the turn.`

			`We also handle multi-call shapes where the model emits several JSON objects`
			separated by `;` or newlines, and JSON arrays `[{...}, {...}]`. The key field
			for Llama 3 is historically `parameters` (older docs) but recent checkpoints
			also emit `arguments` — accept either.
			`"""`
			`from __future__ import annotations`

			`import json`
			`import re`
			`from dataclasses import dataclass`

			`from .base import ToolCall, ToolParser, register`

			`_PYTHON_TAG = "<\|python_tag\|>"`
			`_JSON_OBJECT_RE = re.compile(r"\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}", re.DOTALL)`


			`def _coerce_call(obj: object, index: int) -> ToolCall \| None:`
			`if not isinstance(obj, dict):`
			`return None`
			`name = obj.get("name")`
			`if not isinstance(name, str):`
			`return None`
			`args = obj.get("arguments", obj.get("parameters", {}))`
			`args_str = args if isinstance(args, str) else json.dumps(args, ensure_ascii=False)`
			`return ToolCall(index=index, name=name, arguments=args_str)`


			`@register`
			`class Llama3JsonToolParser(ToolParser):`
			`name = "llama3_json"`

			`def parse(self, text: str) -> tuple[str, list[ToolCall]]:`
			`calls: list[ToolCall] = []`

			`# Strip <\|python_tag\|> segments first — each segment is one tool call`
			`# body. The content after the final python_tag (if any) is the call.`
			`remaining = text`
			`if _PYTHON_TAG in text:`
			`head, *tails = text.split(_PYTHON_TAG)`
			`remaining = head`
			`for tail in tails:`
			`parsed = _try_parse(tail.strip(), len(calls))`
			`calls.extend(parsed)`

			# Any JSON objects / arrays left in `remaining` count as tool calls too
			`# if they parse to a {"name": ..., "arguments": ...} shape.`
			`for match in _JSON_OBJECT_RE.finditer(remaining):`
			`parsed = _try_parse(match.group(0), len(calls))`
			`if parsed:`
			`calls.extend(parsed)`
			`remaining = remaining.replace(match.group(0), "", 1)`

			`content = remaining.strip()`
			`return content, calls`


			`def _try_parse(blob: str, start_index: int) -> list[ToolCall]:`
			`"""Parse a fragment that may be a JSON object or a JSON array of objects."""`
			`blob = blob.strip().rstrip(";")`
			`if not blob:`
			`return []`
			`try:`
			`obj = json.loads(blob)`
			`except json.JSONDecodeError:`
			`return []`
			`if isinstance(obj, dict):`
			`call = _coerce_call(obj, start_index)`
			`return [call] if call else []`
			`if isinstance(obj, list):`
			`calls: list[ToolCall] = []`
			`for i, item in enumerate(obj):`
			`c = _coerce_call(item, start_index + i)`
			`if c:`
			`calls.append(c)`
			`return calls`
			`return []`