LocalAI/backend/python/vllm/test.py

281 lines
11 KiB
Python
Raw Normal View History

import unittest
import subprocess
import time
import backend_pb2
import backend_pb2_grpc
import grpc
import unittest
import subprocess
import time
import grpc
import backend_pb2_grpc
import backend_pb2
class TestBackendServicer(unittest.TestCase):
"""
TestBackendServicer is the class that tests the gRPC service.
This class contains methods to test the startup and shutdown of the gRPC service.
"""
def setUp(self):
self.service = subprocess.Popen(["python", "backend.py", "--addr", "localhost:50051"])
time.sleep(10)
def tearDown(self) -> None:
self.service.terminate()
self.service.wait()
def test_server_startup(self):
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
response = stub.Health(backend_pb2.HealthMessage())
self.assertEqual(response.message, b'OK')
except Exception as err:
print(err)
self.fail("Server failed to start")
finally:
self.tearDown()
def test_load_model(self):
"""
This method tests if the model is loaded successfully
"""
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m"))
self.assertTrue(response.success)
self.assertEqual(response.message, "Model loaded successfully")
except Exception as err:
print(err)
self.fail("LoadModel service failed")
finally:
self.tearDown()
def test_text(self):
"""
This method tests if the embeddings are generated successfully
"""
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m"))
self.assertTrue(response.success)
req = backend_pb2.PredictOptions(Prompt="The capital of France is")
resp = stub.Predict(req)
self.assertIsNotNone(resp.message)
except Exception as err:
print(err)
self.fail("text service failed")
finally:
self.tearDown()
def test_sampling_params(self):
"""
This method tests if all sampling parameters are correctly processed
NOTE: this does NOT test for correctness, just that we received a compatible response
"""
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m"))
self.assertTrue(response.success)
req = backend_pb2.PredictOptions(
Prompt="The capital of France is",
TopP=0.8,
Tokens=50,
Temperature=0.7,
TopK=40,
PresencePenalty=0.1,
FrequencyPenalty=0.2,
RepetitionPenalty=1.1,
MinP=0.05,
Seed=42,
StopPrompts=["\n"],
StopTokenIds=[50256],
BadWords=["badword"],
IncludeStopStrInOutput=True,
IgnoreEOS=True,
MinTokens=5,
Logprobs=5,
PromptLogprobs=5,
SkipSpecialTokens=True,
SpacesBetweenSpecialTokens=True,
TruncatePromptTokens=10,
GuidedDecoding=True,
N=2,
)
resp = stub.Predict(req)
self.assertIsNotNone(resp.message)
self.assertIsNotNone(resp.logprobs)
except Exception as err:
print(err)
self.fail("sampling params service failed")
finally:
self.tearDown()
feat(vllm): parity with llama.cpp backend (#9328) * fix(schema): serialize ToolCallID and Reasoning in Messages.ToProto The ToProto conversion was dropping tool_call_id and reasoning_content even though both proto and Go fields existed, breaking multi-turn tool calling and reasoning passthrough to backends. * refactor(config): introduce backend hook system and migrate llama-cpp defaults Adds RegisterBackendHook/runBackendHooks so each backend can register default-filling functions that run during ModelConfig.SetDefaults(). Migrates the existing GGUF guessing logic into hooks_llamacpp.go, registered for both 'llama-cpp' and the empty backend (auto-detect). Removes the old guesser.go shim. * feat(config): add vLLM parser defaults hook and importer auto-detection Introduces parser_defaults.json mapping model families to vLLM tool_parser/reasoning_parser names, with longest-pattern-first matching. The vllmDefaults hook auto-fills tool_parser and reasoning_parser options at load time for known families, while the VLLMImporter writes the same values into generated YAML so users can review and edit them. Adds tests covering MatchParserDefaults, hook registration via SetDefaults, and the user-override behavior. * feat(vllm): wire native tool/reasoning parsers + chat deltas + logprobs - Use vLLM's ToolParserManager/ReasoningParserManager to extract structured output (tool calls, reasoning content) instead of reimplementing parsing - Convert proto Messages to dicts and pass tools to apply_chat_template - Emit ChatDelta with content/reasoning_content/tool_calls in Reply - Extract prompt_tokens, completion_tokens, and logprobs from output - Replace boolean GuidedDecoding with proper GuidedDecodingParams from Grammar - Add TokenizeString and Free RPC methods - Fix missing `time` import used by load_video() * feat(vllm): CPU support + shared utils + vllm-omni feature parity - Split vllm install per acceleration: move generic `vllm` out of requirements-after.txt into per-profile after files (cublas12, hipblas, intel) and add CPU wheel URL for cpu-after.txt - requirements-cpu.txt now pulls torch==2.7.0+cpu from PyTorch CPU index - backend/index.yaml: register cpu-vllm / cpu-vllm-development variants - New backend/python/common/vllm_utils.py: shared parse_options, messages_to_dicts, setup_parsers helpers (used by both vllm backends) - vllm-omni: replace hardcoded chat template with tokenizer.apply_chat_template, wire native parsers via shared utils, emit ChatDelta with token counts, add TokenizeString and Free RPCs, detect CPU and set VLLM_TARGET_DEVICE - Add test_cpu_inference.py: standalone script to validate CPU build with a small model (Qwen2.5-0.5B-Instruct) * fix(vllm): CPU build compatibility with vllm 0.14.1 Validated end-to-end on CPU with Qwen2.5-0.5B-Instruct (LoadModel, Predict, TokenizeString, Free all working). - requirements-cpu-after.txt: pin vllm to 0.14.1+cpu (pre-built wheel from GitHub releases) for x86_64 and aarch64. vllm 0.14.1 is the newest CPU wheel whose torch dependency resolves against published PyTorch builds (torch==2.9.1+cpu). Later vllm CPU wheels currently require torch==2.10.0+cpu which is only available on the PyTorch test channel with incompatible torchvision. - requirements-cpu.txt: bump torch to 2.9.1+cpu, add torchvision/torchaudio so uv resolves them consistently from the PyTorch CPU index. - install.sh: add --index-strategy=unsafe-best-match for CPU builds so uv can mix the PyTorch index and PyPI for transitive deps (matches the existing intel profile behaviour). - backend.py LoadModel: vllm >= 0.14 removed AsyncLLMEngine.get_model_config so the old code path errored out with AttributeError on model load. Switch to the new get_tokenizer()/tokenizer accessor with a fallback to building the tokenizer directly from request.Model. * fix(vllm): tool parser constructor compat + e2e tool calling test Concrete vLLM tool parsers override the abstract base's __init__ and drop the tools kwarg (e.g. Hermes2ProToolParser only takes tokenizer). Instantiating with tools= raised TypeError which was silently caught, leaving chat_deltas.tool_calls empty. Retry the constructor without the tools kwarg on TypeError — tools aren't required by these parsers since extract_tool_calls finds tool syntax in the raw model output directly. Validated with Qwen/Qwen2.5-0.5B-Instruct + hermes parser on CPU: the backend correctly returns ToolCallDelta{name='get_weather', arguments='{"location": "Paris, France"}'} in ChatDelta. test_tool_calls.py is a standalone smoke test that spawns the gRPC backend, sends a chat completion with tools, and asserts the response contains a structured tool call. * ci(backend): build cpu-vllm container image Add the cpu-vllm variant to the backend container build matrix so the image registered in backend/index.yaml (cpu-vllm / cpu-vllm-development) is actually produced by CI. Follows the same pattern as the other CPU python backends (cpu-diffusers, cpu-chatterbox, etc.) with build-type='' and no CUDA. backend_pr.yml auto-picks this up via its matrix filter from backend.yml. * test(e2e-backends): add tools capability + HF model name support Extends tests/e2e-backends to cover backends that: - Resolve HuggingFace model ids natively (vllm, vllm-omni) instead of loading a local file: BACKEND_TEST_MODEL_NAME is passed verbatim as ModelOptions.Model with no download/ModelFile. - Parse tool calls into ChatDelta.tool_calls: new "tools" capability sends a Predict with a get_weather function definition and asserts the Reply contains a matching ToolCallDelta. Uses UseTokenizerTemplate with OpenAI-style Messages so the backend can wire tools into the model's chat template. - Need backend-specific Options[]: BACKEND_TEST_OPTIONS lets a test set e.g. "tool_parser:hermes,reasoning_parser:qwen3" at LoadModel time. Adds make target test-extra-backend-vllm that: - docker-build-vllm - loads Qwen/Qwen2.5-0.5B-Instruct - runs health,load,predict,stream,tools with tool_parser:hermes Drops backend/python/vllm/test_{cpu_inference,tool_calls}.py — those standalone scripts were scaffolding used while bringing up the Python backend; the e2e-backends harness now covers the same ground uniformly alongside llama-cpp and ik-llama-cpp. * ci(test-extra): run vllm e2e tests on CPU Adds tests-vllm-grpc to the test-extra workflow, mirroring the llama-cpp and ik-llama-cpp gRPC jobs. Triggers when files under backend/python/vllm/ change (or on run-all), builds the local-ai vllm container image, and runs the tests/e2e-backends harness with BACKEND_TEST_MODEL_NAME=Qwen/Qwen2.5-0.5B-Instruct, tool_parser:hermes, and the tools capability enabled. Uses ubuntu-latest (no GPU) — vllm runs on CPU via the cpu-vllm wheel we pinned in requirements-cpu-after.txt. Frees disk space before the build since the docker image + torch + vllm wheel is sizeable. * fix(vllm): build from source on CI to avoid SIGILL on prebuilt wheel The prebuilt vllm 0.14.1+cpu wheel from GitHub releases is compiled with SIMD instructions (AVX-512 VNNI/BF16 or AMX-BF16) that not every CPU supports. GitHub Actions ubuntu-latest runners SIGILL when vllm spawns the model_executor.models.registry subprocess for introspection, so LoadModel never reaches the actual inference path. - install.sh: when FROM_SOURCE=true on a CPU build, temporarily hide requirements-cpu-after.txt so installRequirements installs the base deps + torch CPU without pulling the prebuilt wheel, then clone vllm and compile it with VLLM_TARGET_DEVICE=cpu. The resulting binaries target the host's actual CPU. - backend/Dockerfile.python: accept a FROM_SOURCE build-arg and expose it as an ENV so install.sh sees it during `make`. - Makefile docker-build-backend: forward FROM_SOURCE as --build-arg when set, so backends that need source builds can opt in. - Makefile test-extra-backend-vllm: call docker-build-vllm via a recursive $(MAKE) invocation so FROM_SOURCE flows through. - .github/workflows/test-extra.yml: set FROM_SOURCE=true on the tests-vllm-grpc job. Slower but reliable — the prebuilt wheel only works on hosts that share the build-time SIMD baseline. Answers 'did you test locally?': yes, end-to-end on my local machine with the prebuilt wheel (CPU supports AVX-512 VNNI). The CI runner CPU gap was not covered locally — this commit plugs that gap. * ci(vllm): use bigger-runner instead of source build The prebuilt vllm 0.14.1+cpu wheel requires SIMD instructions (AVX-512 VNNI/BF16) that stock ubuntu-latest GitHub runners don't support — vllm.model_executor.models.registry SIGILLs on import during LoadModel. Source compilation works but takes 30-40 minutes per CI run, which is too slow for an e2e smoke test. Instead, switch tests-vllm-grpc to the bigger-runner self-hosted label (already used by backend.yml for the llama-cpp CUDA build) — that hardware has the required SIMD baseline and the prebuilt wheel runs cleanly. FROM_SOURCE=true is kept as an opt-in escape hatch: - install.sh still has the CPU source-build path for hosts that need it - backend/Dockerfile.python still declares the ARG + ENV - Makefile docker-build-backend still forwards the build-arg when set Default CI path uses the fast prebuilt wheel; source build can be re-enabled by exporting FROM_SOURCE=true in the environment. * ci(vllm): install make + build deps on bigger-runner bigger-runner is a bare self-hosted runner used by backend.yml for docker image builds — it has docker but not the usual ubuntu-latest toolchain. The make-based test target needs make, build-essential (cgo in 'go test'), and curl/unzip (the Makefile protoc target downloads protoc from github releases). protoc-gen-go and protoc-gen-go-grpc come via 'go install' in the install-go-tools target, which setup-go makes possible. * ci(vllm): install libnuma1 + libgomp1 on bigger-runner The vllm 0.14.1+cpu wheel ships a _C C++ extension that dlopens libnuma.so.1 at import time. When the runner host doesn't have it, the extension silently fails to register its torch ops, so EngineCore crashes on init_device with: AttributeError: '_OpNamespace' '_C_utils' object has no attribute 'init_cpu_threads_env' Also add libgomp1 (OpenMP runtime, used by torch CPU kernels) to be safe on stripped-down runners. * feat(vllm): bundle libnuma/libgomp via package.sh The vllm CPU wheel ships a _C extension that dlopens libnuma.so.1 at import time; torch's CPU kernels in turn use libgomp.so.1 (OpenMP). Without these on the host, vllm._C silently fails to register its torch ops and EngineCore crashes with: AttributeError: '_OpNamespace' '_C_utils' object has no attribute 'init_cpu_threads_env' Rather than asking every user to install libnuma1/libgomp1 on their host (or every LocalAI base image to ship them), bundle them into the backend image itself — same pattern fish-speech and the GPU libs already use. libbackend.sh adds ${EDIR}/lib to LD_LIBRARY_PATH at run time so the bundled copies are picked up automatically. - backend/python/vllm/package.sh (new): copies libnuma.so.1 and libgomp.so.1 from the builder's multilib paths into ${BACKEND}/lib, preserving soname symlinks. Runs during Dockerfile.python's 'Run backend-specific packaging' step (which already invokes package.sh if present). - backend/Dockerfile.python: install libnuma1 + libgomp1 in the builder stage so package.sh has something to copy (the Ubuntu base image otherwise only has libgomp in the gcc dep chain). - test-extra.yml: drop the workaround that installed these libs on the runner host — with the backend image self-contained, the runner no longer needs them, and the test now exercises the packaging path end-to-end the way a production host would. * ci(vllm): disable tests-vllm-grpc job (heterogeneous runners) Both ubuntu-latest and bigger-runner have inconsistent CPU baselines: some instances support the AVX-512 VNNI/BF16 instructions the prebuilt vllm 0.14.1+cpu wheel was compiled with, others SIGILL on import of vllm.model_executor.models.registry. The libnuma packaging fix doesn't help when the wheel itself can't be loaded. FROM_SOURCE=true compiles vllm against the actual host CPU and works everywhere, but takes 30-50 minutes per run — too slow for a smoke test on every PR. Comment out the job for now. The test itself is intact and passes locally; run it via 'make test-extra-backend-vllm' on a host with the required SIMD baseline. Re-enable when: - we have a self-hosted runner label with guaranteed AVX-512 VNNI/BF16, or - vllm publishes a CPU wheel with a wider baseline, or - we set up a docker layer cache that makes FROM_SOURCE acceptable The detect-changes vllm output, the test harness changes (tests/ e2e-backends + tools cap), the make target (test-extra-backend-vllm), the package.sh and the Dockerfile/install.sh plumbing all stay in place.
2026-04-13 09:00:29 +00:00
def test_messages_to_dicts(self):
"""
Tests _messages_to_dicts conversion of proto Messages to dicts.
"""
import sys, os
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from backend import BackendServicer
servicer = BackendServicer()
msgs = [
backend_pb2.Message(role="user", content="hello"),
backend_pb2.Message(
role="assistant",
content="",
tool_calls='[{"id":"call_1","type":"function","function":{"name":"foo","arguments":"{}"}}]',
reasoning_content="thinking...",
),
backend_pb2.Message(role="tool", content="result", name="foo", tool_call_id="call_1"),
]
result = servicer._messages_to_dicts(msgs)
self.assertEqual(len(result), 3)
self.assertEqual(result[0], {"role": "user", "content": "hello"})
self.assertEqual(result[1]["reasoning_content"], "thinking...")
self.assertIsInstance(result[1]["tool_calls"], list)
self.assertEqual(result[1]["tool_calls"][0]["id"], "call_1")
self.assertEqual(result[2]["tool_call_id"], "call_1")
self.assertEqual(result[2]["name"], "foo")
def test_parse_options(self):
"""
Tests _parse_options correctly parses key:value strings.
"""
import sys, os
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from backend import BackendServicer
servicer = BackendServicer()
opts = servicer._parse_options([
"tool_parser:hermes",
"reasoning_parser:deepseek_r1",
"invalid_no_colon",
"key_with_colons:a:b:c",
])
self.assertEqual(opts["tool_parser"], "hermes")
self.assertEqual(opts["reasoning_parser"], "deepseek_r1")
self.assertEqual(opts["key_with_colons"], "a:b:c")
self.assertNotIn("invalid_no_colon", opts)
feat(vllm): expose AsyncEngineArgs via generic engine_args YAML map (#9563) * feat(vllm): expose AsyncEngineArgs via generic engine_args YAML map LocalAI's vLLM backend wraps a small typed subset of vLLM's AsyncEngineArgs (quantization, tensor_parallel_size, dtype, etc.). Anything outside that subset -- pipeline/data/expert parallelism, speculative_config, kv_transfer_config, all2all_backend, prefix caching, chunked prefill, etc. -- requires a new protobuf field, a Go struct field, an options.go line, and a backend.py mapping per feature. That cadence is the bottleneck on shipping vLLM's production feature set. Add a generic `engine_args:` map on the model YAML that is JSON-serialised into a new ModelOptions.EngineArgs proto field and applied verbatim to AsyncEngineArgs at LoadModel time. Validation is done by the Python backend via dataclasses.fields(); unknown keys fail with the closest valid name as a hint. dataclasses.replace() is used so vLLM's __post_init__ re-runs and auto-converts dict values into nested config dataclasses (CompilationConfig, AttentionConfig, ...). speculative_config and kv_transfer_config flow through as dicts; vLLM converts them at engine init. Operators can now write: engine_args: data_parallel_size: 8 enable_expert_parallel: true all2all_backend: deepep_low_latency speculative_config: method: deepseek_mtp num_speculative_tokens: 3 kv_cache_dtype: fp8 without further proto/Go/Python plumbing per field. Production defaults seeded by hooks_vllm.go: enable_prefix_caching and enable_chunked_prefill default to true unless explicitly set. Existing typed YAML fields (gpu_memory_utilization, tensor_parallel_size, etc.) remain for back-compat; engine_args overrides them when both are set. Assisted-by: Claude:claude-opus-4-7 [Claude Code] Signed-off-by: Richard Palethorpe <io@richiejp.com> * chore(vllm): pin cublas13 to vLLM 0.20.0 cu130 wheel vLLM's PyPI wheel is built against CUDA 12 (libcudart.so.12) and won't load on a cu130 host. Switch the cublas13 build to vLLM's per-tag cu130 simple-index (https://wheels.vllm.ai/0.20.0/cu130/) and pin vllm==0.20.0. The cu130-flavoured wheel ships libcudart.so.13 and includes the DFlash speculative-decoding method that landed in 0.20.0. cublas13 install gets --index-strategy=unsafe-best-match so uv consults both the cu130 index and PyPI when resolving — PyPI also publishes vllm==0.20.0, but with cu12 binaries that error at import time. Verified: Qwen3.5-4B + z-lab/Qwen3.5-4B-DFlash loads and serves chat completions on RTX 5070 Ti (sm_120, cu130). Assisted-by: Claude:claude-opus-4-7 [Claude Code] Signed-off-by: Richard Palethorpe <io@richiejp.com> * ci(vllm): bot job to bump cublas13 vLLM wheel pin vLLM's cu130 wheel index URL is itself version-locked (wheels.vllm.ai/<TAG>/cu130/, no /latest/ alias upstream), so a vLLM bump means rewriting two values atomically — the URL segment and the version constraint. bump_deps.sh handles git-sha-in-Makefile only; add a sibling bump_vllm_wheel.sh and a matching workflow job that mirrors the existing matrix's PR-creation pattern. The bumper queries /releases/latest (which excludes prereleases), strips the leading 'v', and seds both lines unconditionally. When the file is already on the latest tag the rewrite is a no-op and peter-evans/create-pull-request opens no PR. Assisted-by: Claude:claude-opus-4-7 [Claude Code] Signed-off-by: Richard Palethorpe <io@richiejp.com> * docs(vllm): document engine_args and speculative decoding The new engine_args: map plumbs arbitrary AsyncEngineArgs through to vLLM, but the public docs only covered the basic typed fields. Add a short subsection in the vLLM section explaining the typed/generic split and showing a worked DFlash speculative-decoding config, with pointers to vLLM's SpeculativeConfig reference and z-lab's drafter collection. Assisted-by: Claude:claude-opus-4-7 [Claude Code] Signed-off-by: Richard Palethorpe <io@richiejp.com> --------- Signed-off-by: Richard Palethorpe <io@richiejp.com> Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2026-04-28 22:49:28 +00:00
def test_apply_engine_args_known_keys(self):
"""
Tests _apply_engine_args overlays user-supplied JSON onto AsyncEngineArgs.
"""
import sys, os, json as _json
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from backend import BackendServicer
from vllm.engine.arg_utils import AsyncEngineArgs
servicer = BackendServicer()
base = AsyncEngineArgs(model="facebook/opt-125m")
extras = _json.dumps({
"trust_remote_code": True,
"max_num_seqs": 32,
})
out = servicer._apply_engine_args(base, extras)
self.assertTrue(out.trust_remote_code)
self.assertEqual(out.max_num_seqs, 32)
# untouched fields preserved
self.assertEqual(out.model, "facebook/opt-125m")
def test_apply_engine_args_unknown_key_raises(self):
"""
Tests _apply_engine_args rejects unknown keys with a helpful suggestion.
"""
import sys, os, json as _json
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from backend import BackendServicer
from vllm.engine.arg_utils import AsyncEngineArgs
servicer = BackendServicer()
base = AsyncEngineArgs(model="facebook/opt-125m")
with self.assertRaises(ValueError) as ctx:
servicer._apply_engine_args(base, _json.dumps({"trustremotecode": True}))
self.assertIn("trustremotecode", str(ctx.exception))
# close-match hint for the typo
self.assertIn("trust_remote_code", str(ctx.exception))
def test_apply_engine_args_empty_passthrough(self):
"""
Tests that empty engine_args returns the base unchanged.
"""
import sys, os
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from backend import BackendServicer
from vllm.engine.arg_utils import AsyncEngineArgs
servicer = BackendServicer()
base = AsyncEngineArgs(model="facebook/opt-125m")
self.assertIs(servicer._apply_engine_args(base, ""), base)
self.assertIs(servicer._apply_engine_args(base, None), base)
feat(vllm): parity with llama.cpp backend (#9328) * fix(schema): serialize ToolCallID and Reasoning in Messages.ToProto The ToProto conversion was dropping tool_call_id and reasoning_content even though both proto and Go fields existed, breaking multi-turn tool calling and reasoning passthrough to backends. * refactor(config): introduce backend hook system and migrate llama-cpp defaults Adds RegisterBackendHook/runBackendHooks so each backend can register default-filling functions that run during ModelConfig.SetDefaults(). Migrates the existing GGUF guessing logic into hooks_llamacpp.go, registered for both 'llama-cpp' and the empty backend (auto-detect). Removes the old guesser.go shim. * feat(config): add vLLM parser defaults hook and importer auto-detection Introduces parser_defaults.json mapping model families to vLLM tool_parser/reasoning_parser names, with longest-pattern-first matching. The vllmDefaults hook auto-fills tool_parser and reasoning_parser options at load time for known families, while the VLLMImporter writes the same values into generated YAML so users can review and edit them. Adds tests covering MatchParserDefaults, hook registration via SetDefaults, and the user-override behavior. * feat(vllm): wire native tool/reasoning parsers + chat deltas + logprobs - Use vLLM's ToolParserManager/ReasoningParserManager to extract structured output (tool calls, reasoning content) instead of reimplementing parsing - Convert proto Messages to dicts and pass tools to apply_chat_template - Emit ChatDelta with content/reasoning_content/tool_calls in Reply - Extract prompt_tokens, completion_tokens, and logprobs from output - Replace boolean GuidedDecoding with proper GuidedDecodingParams from Grammar - Add TokenizeString and Free RPC methods - Fix missing `time` import used by load_video() * feat(vllm): CPU support + shared utils + vllm-omni feature parity - Split vllm install per acceleration: move generic `vllm` out of requirements-after.txt into per-profile after files (cublas12, hipblas, intel) and add CPU wheel URL for cpu-after.txt - requirements-cpu.txt now pulls torch==2.7.0+cpu from PyTorch CPU index - backend/index.yaml: register cpu-vllm / cpu-vllm-development variants - New backend/python/common/vllm_utils.py: shared parse_options, messages_to_dicts, setup_parsers helpers (used by both vllm backends) - vllm-omni: replace hardcoded chat template with tokenizer.apply_chat_template, wire native parsers via shared utils, emit ChatDelta with token counts, add TokenizeString and Free RPCs, detect CPU and set VLLM_TARGET_DEVICE - Add test_cpu_inference.py: standalone script to validate CPU build with a small model (Qwen2.5-0.5B-Instruct) * fix(vllm): CPU build compatibility with vllm 0.14.1 Validated end-to-end on CPU with Qwen2.5-0.5B-Instruct (LoadModel, Predict, TokenizeString, Free all working). - requirements-cpu-after.txt: pin vllm to 0.14.1+cpu (pre-built wheel from GitHub releases) for x86_64 and aarch64. vllm 0.14.1 is the newest CPU wheel whose torch dependency resolves against published PyTorch builds (torch==2.9.1+cpu). Later vllm CPU wheels currently require torch==2.10.0+cpu which is only available on the PyTorch test channel with incompatible torchvision. - requirements-cpu.txt: bump torch to 2.9.1+cpu, add torchvision/torchaudio so uv resolves them consistently from the PyTorch CPU index. - install.sh: add --index-strategy=unsafe-best-match for CPU builds so uv can mix the PyTorch index and PyPI for transitive deps (matches the existing intel profile behaviour). - backend.py LoadModel: vllm >= 0.14 removed AsyncLLMEngine.get_model_config so the old code path errored out with AttributeError on model load. Switch to the new get_tokenizer()/tokenizer accessor with a fallback to building the tokenizer directly from request.Model. * fix(vllm): tool parser constructor compat + e2e tool calling test Concrete vLLM tool parsers override the abstract base's __init__ and drop the tools kwarg (e.g. Hermes2ProToolParser only takes tokenizer). Instantiating with tools= raised TypeError which was silently caught, leaving chat_deltas.tool_calls empty. Retry the constructor without the tools kwarg on TypeError — tools aren't required by these parsers since extract_tool_calls finds tool syntax in the raw model output directly. Validated with Qwen/Qwen2.5-0.5B-Instruct + hermes parser on CPU: the backend correctly returns ToolCallDelta{name='get_weather', arguments='{"location": "Paris, France"}'} in ChatDelta. test_tool_calls.py is a standalone smoke test that spawns the gRPC backend, sends a chat completion with tools, and asserts the response contains a structured tool call. * ci(backend): build cpu-vllm container image Add the cpu-vllm variant to the backend container build matrix so the image registered in backend/index.yaml (cpu-vllm / cpu-vllm-development) is actually produced by CI. Follows the same pattern as the other CPU python backends (cpu-diffusers, cpu-chatterbox, etc.) with build-type='' and no CUDA. backend_pr.yml auto-picks this up via its matrix filter from backend.yml. * test(e2e-backends): add tools capability + HF model name support Extends tests/e2e-backends to cover backends that: - Resolve HuggingFace model ids natively (vllm, vllm-omni) instead of loading a local file: BACKEND_TEST_MODEL_NAME is passed verbatim as ModelOptions.Model with no download/ModelFile. - Parse tool calls into ChatDelta.tool_calls: new "tools" capability sends a Predict with a get_weather function definition and asserts the Reply contains a matching ToolCallDelta. Uses UseTokenizerTemplate with OpenAI-style Messages so the backend can wire tools into the model's chat template. - Need backend-specific Options[]: BACKEND_TEST_OPTIONS lets a test set e.g. "tool_parser:hermes,reasoning_parser:qwen3" at LoadModel time. Adds make target test-extra-backend-vllm that: - docker-build-vllm - loads Qwen/Qwen2.5-0.5B-Instruct - runs health,load,predict,stream,tools with tool_parser:hermes Drops backend/python/vllm/test_{cpu_inference,tool_calls}.py — those standalone scripts were scaffolding used while bringing up the Python backend; the e2e-backends harness now covers the same ground uniformly alongside llama-cpp and ik-llama-cpp. * ci(test-extra): run vllm e2e tests on CPU Adds tests-vllm-grpc to the test-extra workflow, mirroring the llama-cpp and ik-llama-cpp gRPC jobs. Triggers when files under backend/python/vllm/ change (or on run-all), builds the local-ai vllm container image, and runs the tests/e2e-backends harness with BACKEND_TEST_MODEL_NAME=Qwen/Qwen2.5-0.5B-Instruct, tool_parser:hermes, and the tools capability enabled. Uses ubuntu-latest (no GPU) — vllm runs on CPU via the cpu-vllm wheel we pinned in requirements-cpu-after.txt. Frees disk space before the build since the docker image + torch + vllm wheel is sizeable. * fix(vllm): build from source on CI to avoid SIGILL on prebuilt wheel The prebuilt vllm 0.14.1+cpu wheel from GitHub releases is compiled with SIMD instructions (AVX-512 VNNI/BF16 or AMX-BF16) that not every CPU supports. GitHub Actions ubuntu-latest runners SIGILL when vllm spawns the model_executor.models.registry subprocess for introspection, so LoadModel never reaches the actual inference path. - install.sh: when FROM_SOURCE=true on a CPU build, temporarily hide requirements-cpu-after.txt so installRequirements installs the base deps + torch CPU without pulling the prebuilt wheel, then clone vllm and compile it with VLLM_TARGET_DEVICE=cpu. The resulting binaries target the host's actual CPU. - backend/Dockerfile.python: accept a FROM_SOURCE build-arg and expose it as an ENV so install.sh sees it during `make`. - Makefile docker-build-backend: forward FROM_SOURCE as --build-arg when set, so backends that need source builds can opt in. - Makefile test-extra-backend-vllm: call docker-build-vllm via a recursive $(MAKE) invocation so FROM_SOURCE flows through. - .github/workflows/test-extra.yml: set FROM_SOURCE=true on the tests-vllm-grpc job. Slower but reliable — the prebuilt wheel only works on hosts that share the build-time SIMD baseline. Answers 'did you test locally?': yes, end-to-end on my local machine with the prebuilt wheel (CPU supports AVX-512 VNNI). The CI runner CPU gap was not covered locally — this commit plugs that gap. * ci(vllm): use bigger-runner instead of source build The prebuilt vllm 0.14.1+cpu wheel requires SIMD instructions (AVX-512 VNNI/BF16) that stock ubuntu-latest GitHub runners don't support — vllm.model_executor.models.registry SIGILLs on import during LoadModel. Source compilation works but takes 30-40 minutes per CI run, which is too slow for an e2e smoke test. Instead, switch tests-vllm-grpc to the bigger-runner self-hosted label (already used by backend.yml for the llama-cpp CUDA build) — that hardware has the required SIMD baseline and the prebuilt wheel runs cleanly. FROM_SOURCE=true is kept as an opt-in escape hatch: - install.sh still has the CPU source-build path for hosts that need it - backend/Dockerfile.python still declares the ARG + ENV - Makefile docker-build-backend still forwards the build-arg when set Default CI path uses the fast prebuilt wheel; source build can be re-enabled by exporting FROM_SOURCE=true in the environment. * ci(vllm): install make + build deps on bigger-runner bigger-runner is a bare self-hosted runner used by backend.yml for docker image builds — it has docker but not the usual ubuntu-latest toolchain. The make-based test target needs make, build-essential (cgo in 'go test'), and curl/unzip (the Makefile protoc target downloads protoc from github releases). protoc-gen-go and protoc-gen-go-grpc come via 'go install' in the install-go-tools target, which setup-go makes possible. * ci(vllm): install libnuma1 + libgomp1 on bigger-runner The vllm 0.14.1+cpu wheel ships a _C C++ extension that dlopens libnuma.so.1 at import time. When the runner host doesn't have it, the extension silently fails to register its torch ops, so EngineCore crashes on init_device with: AttributeError: '_OpNamespace' '_C_utils' object has no attribute 'init_cpu_threads_env' Also add libgomp1 (OpenMP runtime, used by torch CPU kernels) to be safe on stripped-down runners. * feat(vllm): bundle libnuma/libgomp via package.sh The vllm CPU wheel ships a _C extension that dlopens libnuma.so.1 at import time; torch's CPU kernels in turn use libgomp.so.1 (OpenMP). Without these on the host, vllm._C silently fails to register its torch ops and EngineCore crashes with: AttributeError: '_OpNamespace' '_C_utils' object has no attribute 'init_cpu_threads_env' Rather than asking every user to install libnuma1/libgomp1 on their host (or every LocalAI base image to ship them), bundle them into the backend image itself — same pattern fish-speech and the GPU libs already use. libbackend.sh adds ${EDIR}/lib to LD_LIBRARY_PATH at run time so the bundled copies are picked up automatically. - backend/python/vllm/package.sh (new): copies libnuma.so.1 and libgomp.so.1 from the builder's multilib paths into ${BACKEND}/lib, preserving soname symlinks. Runs during Dockerfile.python's 'Run backend-specific packaging' step (which already invokes package.sh if present). - backend/Dockerfile.python: install libnuma1 + libgomp1 in the builder stage so package.sh has something to copy (the Ubuntu base image otherwise only has libgomp in the gcc dep chain). - test-extra.yml: drop the workaround that installed these libs on the runner host — with the backend image self-contained, the runner no longer needs them, and the test now exercises the packaging path end-to-end the way a production host would. * ci(vllm): disable tests-vllm-grpc job (heterogeneous runners) Both ubuntu-latest and bigger-runner have inconsistent CPU baselines: some instances support the AVX-512 VNNI/BF16 instructions the prebuilt vllm 0.14.1+cpu wheel was compiled with, others SIGILL on import of vllm.model_executor.models.registry. The libnuma packaging fix doesn't help when the wheel itself can't be loaded. FROM_SOURCE=true compiles vllm against the actual host CPU and works everywhere, but takes 30-50 minutes per run — too slow for a smoke test on every PR. Comment out the job for now. The test itself is intact and passes locally; run it via 'make test-extra-backend-vllm' on a host with the required SIMD baseline. Re-enable when: - we have a self-hosted runner label with guaranteed AVX-512 VNNI/BF16, or - vllm publishes a CPU wheel with a wider baseline, or - we set up a docker layer cache that makes FROM_SOURCE acceptable The detect-changes vllm output, the test harness changes (tests/ e2e-backends + tools cap), the make target (test-extra-backend-vllm), the package.sh and the Dockerfile/install.sh plumbing all stay in place.
2026-04-13 09:00:29 +00:00
def test_tokenize_string(self):
"""
Tests the TokenizeString RPC returns valid tokens.
"""
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m"))
self.assertTrue(response.success)
resp = stub.TokenizeString(backend_pb2.PredictOptions(Prompt="Hello world"))
self.assertGreater(resp.length, 0)
self.assertEqual(len(resp.tokens), resp.length)
except Exception as err:
print(err)
self.fail("TokenizeString service failed")
finally:
self.tearDown()
def test_free(self):
"""
Tests the Free RPC doesn't crash.
"""
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m"))
self.assertTrue(response.success)
free_resp = stub.Free(backend_pb2.HealthMessage())
self.assertTrue(free_resp.success)
except Exception as err:
print(err)
self.fail("Free service failed")
finally:
self.tearDown()
def test_embedding(self):
"""
This method tests if the embeddings are generated successfully
"""
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
response = stub.LoadModel(backend_pb2.ModelOptions(Model="intfloat/e5-mistral-7b-instruct"))
self.assertTrue(response.success)
embedding_request = backend_pb2.PredictOptions(Embeddings="This is a test sentence.")
embedding_response = stub.Embedding(embedding_request)
self.assertIsNotNone(embedding_response.embeddings)
# assert that is a list of floats
self.assertIsInstance(embedding_response.embeddings, list)
# assert that the list is not empty
self.assertTrue(len(embedding_response.embeddings) > 0)
except Exception as err:
print(err)
self.fail("Embedding service failed")
finally:
self.tearDown()