Fix round 2 regressions: ROCm validate_server and Windows HIP routing

Follow-up to 810b833b addressing review findings on the first round of
hardening commits:

- install_llama_prebuilt.py validate_server: gate --n-gpu-layers on the
  resolved install_kind instead of host.has_rocm. AMD Windows hosts
  without a HIP prebuilt fall back to windows-cpu and must not be
  validated with GPU layers; thread install_kind through from the
  caller.
- install_llama_prebuilt.py resolve_release_asset_choice: reinstate the
  "not has_rocm" guard on the published windows-cpu bundle so AMD
  Windows hosts reach resolve_upstream_asset_choice() where the new
  HIP prebuilt path lives. Prefer a published windows-hip bundle first
  when one exists, fall through to upstream HIP + upstream CPU
  otherwise.
- install_llama_prebuilt.py detect_host: also set has_physical_nvidia
  when the secondary --query-gpu block confirms a working NVIDIA GPU,
  so older nvidia-smi versions without -L support do not silently skip
  the Linux diagnostics that key off has_physical_nvidia.
- install_llama_prebuilt.py: drop redundant "import re as _re" /
  "import re as _re_rocm" local aliases in favour of the existing
  top-level "import re".
- install_python_stack.py _ensure_rocm_torch: run the AMD
  bitsandbytes install unconditionally after the HIP-torch probe, so
  that "unsloth studio update" on a venv that already has ROCm torch
  still picks up the AMD bitsandbytes build.
- install.sh: add a non-x86_64 early-exit to get_torch_index_url() so
  aarch64 / arm64 Linux hosts do not hit the ROCm wheel index
  (PyTorch only publishes ROCm wheels for linux_x86_64).
- install.sh: add bitsandbytes install to the migrated-environment
  branch so upgrades pick it up for ROCm hosts instead of only the
  fresh-install path.
- install.sh: in the Radeon wheel path, pass version constraints +
  --no-index --find-links to uv instead of explicit wheel URLs so a
  version-compatible torch / torchvision / torchaudio triple is
  resolved, rather than picking the highest-version wheel for each
  package independently.
- studio/backend/utils/hardware/amd.py _first_visible_amd_gpu_id: fall
  through to lower-priority visibility env vars when the first entry
  is malformed (leading comma, all-whitespace first token) instead of
  silently returning GPU 0.

This commit is contained in:
Daniel Han 2026-04-08 09:46:41 +00:00
parent 810b833b01
commit 8636fa63fc
4 changed files with 119 additions and 54 deletions

View file

@ -1030,7 +1030,14 @@ get_torch_index_url() {
fi
fi
if [ -z "$_smi" ]; then
# No NVIDIA GPU -- check for AMD ROCm GPU
# No NVIDIA GPU -- check for AMD ROCm GPU.
# PyTorch only publishes ROCm wheels for linux-x86_64; skip the
# ROCm branch entirely on aarch64 / arm64 / other architectures
# so non-x86_64 Linux hosts fall back cleanly to CPU wheels.
case "$(uname -m)" in
x86_64|amd64) : ;;
*) echo "$_base/cpu"; return ;;
esac
if ! _has_amd_rocm_gpu; then
echo "$_base/cpu"; return
fi
@ -1241,6 +1248,17 @@ if [ "$_MIGRATED" = true ]; then
substep "overlaying local repo (editable)..."
run_install_cmd "overlay local repo" uv pip install --python "$_VENV_PY" -e "$_REPO_ROOT" --no-deps
fi
# AMD ROCm: install bitsandbytes even in migrated environments so
# existing ROCm installs gain the AMD bitsandbytes build without a
# fresh reinstall.
if [ "$SKIP_TORCH" = false ]; then
case "$TORCH_INDEX_URL" in
*/rocm*)
substep "installing bitsandbytes for AMD ROCm..."
run_install_cmd "install bitsandbytes (AMD)" uv pip install --python "$_VENV_PY" "bitsandbytes>=0.49.1"
;;
esac
fi
elif [ -n "$TORCH_INDEX_URL" ]; then
# Fresh: Step 1 - install torch from explicit index (skip when --no-torch or Intel Mac)
if [ "$SKIP_TORCH" = true ]; then
@ -1277,14 +1295,29 @@ elif [ -n "$TORCH_INDEX_URL" ]; then
--index-url "$TORCH_INDEX_URL"
else
substep "installing PyTorch from Radeon repo (${_RADEON_BASE_URL})..."
# Use version constraints + --find-links + --no-index so
# uv resolves a compatible torch / torchvision / torchaudio
# set from the Radeon listing (instead of picking the
# highest-version wheel for each package independently,
# which can assemble a version-mismatched stack).
# The wheel presence check above guarantees the listing
# has at least one wheel per package; uv will pick the
# newest version-compatible triple.
if [ -n "$_tri_whl" ]; then
run_install_cmd "install triton + PyTorch" uv pip install --python "$_VENV_PY" \
--no-index \
--find-links "$_RADEON_BASE_URL" \
"$_tri_whl" "$_torch_whl" "$_tv_whl" "$_ta_whl"
"$TORCH_CONSTRAINT" \
"torchvision<0.26.0" \
"torchaudio<2.11.0" \
"triton<3.7"
else
run_install_cmd "install PyTorch" uv pip install --python "$_VENV_PY" \
--no-index \
--find-links "$_RADEON_BASE_URL" \
"$_torch_whl" "$_tv_whl" "$_ta_whl"
"$TORCH_CONSTRAINT" \
"torchvision<0.26.0" \
"torchaudio<2.11.0"
fi
fi
else

View file

@ -220,7 +220,10 @@ def _first_visible_amd_gpu_id() -> Optional[str]:
first = raw.split(",", 1)[0].strip()
if first:
return first
break
# Leading comma or all-whitespace first token -- fall through to
# the next env var in priority order rather than silently
# returning GPU 0.
continue
return "0"

View file

@ -2554,6 +2554,12 @@ def detect_host() -> HostInfo:
if visible_gpu_rows:
has_usable_nvidia = True
# Older nvidia-smi versions (pre -L support) hit the
# except in the first try block but still succeed here,
# leaving has_physical_nvidia unset. Mirror the -L path
# so downstream diagnostics on line ~4390 still run.
if not has_physical_nvidia:
has_physical_nvidia = True
elif visible_device_tokens == []:
has_usable_nvidia = False
elif supports_explicit_visible_device_matching(visible_device_tokens):
@ -2564,17 +2570,16 @@ def detect_host() -> HostInfo:
pass
# Detect AMD ROCm (HIP) -- require actual GPU, not just tools installed
import re as _re
def _amd_smi_has_gpu(stdout: str) -> bool:
"""Check for 'GPU: <number>' data rows, not just a table header."""
return bool(_re.search(r"(?im)^gpu\s*[:\[]\s*\d", stdout))
return bool(re.search(r"(?im)^gpu\s*[:\[]\s*\d", stdout))
has_rocm = False
if is_linux:
for _cmd, _check in (
# rocminfo: look for "gfxNNNN" with nonzero first digit (gfx000 is CPU agent)
(["rocminfo"], lambda out: bool(_re.search(r"gfx[1-9]", out.lower()))),
(["rocminfo"], lambda out: bool(re.search(r"gfx[1-9]", out.lower()))),
(["amd-smi", "list"], _amd_smi_has_gpu),
):
_exe = shutil.which(_cmd[0])
@ -3000,10 +3005,8 @@ def resolve_upstream_asset_choice(host: HostInfo, llama_tag: str) -> AssetChoice
# the newest one. Hardcoding a single rocm-7.2 filename means
# ROCm 6.x / 7.0 / 7.1 / 7.3+ users always fall through to a
# source build even when a matching prebuilt exists upstream.
import re as _re_rocm
_rocm_pattern = _re_rocm.compile(
rf"llama-{_re_rocm.escape(llama_tag)}-bin-ubuntu-rocm-([0-9]+(?:\.[0-9]+)*)-x64\.tar\.gz"
_rocm_pattern = re.compile(
rf"llama-{re.escape(llama_tag)}-bin-ubuntu-rocm-([0-9]+(?:\.[0-9]+)*)-x64\.tar\.gz"
)
rocm_candidates: list[tuple[tuple[int, ...], str]] = []
for _name in upstream_assets:
@ -3153,13 +3156,20 @@ def resolve_release_asset_choice(
published_choice: AssetChoice | None = None
if host.is_windows and host.is_x86_64:
# Always try the published Windows CPU bundle, even on AMD ROCm
# hosts. If a windows-hip bundle is added to published releases
# in the future, the upstream resolver below would pick it first
# via resolve_asset_choice; falling back to the hash-approved
# windows-cpu bundle is still better than the upstream CPU
# asset for AMD Windows hosts without a HIP prebuilt.
published_choice = published_asset_choice_for_kind(release, "windows-cpu")
# AMD Windows hosts should prefer a hash-approved published
# Windows HIP bundle when one exists, but otherwise fall through
# to resolve_asset_choice() so the upstream HIP prebuilt is
# tried before the CPU fallback. Hard-pinning the published
# windows-cpu bundle here would make the new HIP path
# unreachable.
if host.has_rocm:
published_choice = published_asset_choice_for_kind(
release, "windows-hip"
)
else:
published_choice = published_asset_choice_for_kind(
release, "windows-cpu"
)
elif host.is_macos and host.is_arm64:
published_choice = published_asset_choice_for_kind(release, "macos-arm64")
elif host.is_macos and host.is_x86_64:
@ -4248,6 +4258,7 @@ def validate_server(
install_dir: Path,
*,
runtime_line: str | None = None,
install_kind: str | None = None,
) -> None:
last_failure: PrebuiltFallback | None = None
for port_attempt in range(1, SERVER_PORT_BIND_ATTEMPTS + 1):
@ -4271,11 +4282,21 @@ def validate_server(
"--batch-size",
"32",
]
if (
host.has_usable_nvidia
or host.has_rocm
or (host.is_macos and host.is_arm64)
):
# Only enable GPU offload for assets that actually ship GPU code.
# Gating on `host.has_rocm` alone breaks the intentional CPU
# fallback on AMD Windows hosts without a HIP prebuilt: the CPU
# binary would be launched with `--n-gpu-layers 1` and fail
# validation. Use the resolved install_kind as the source of
# truth and fall back to host detection when the caller did not
# pass one (keeps backwards compatibility with older call sites).
_gpu_kinds = {"linux-cuda", "linux-rocm", "windows-cuda", "windows-hip", "macos-arm64"}
if install_kind is not None:
_enable_gpu_layers = install_kind in _gpu_kinds
else:
_enable_gpu_layers = host.has_usable_nvidia or (
host.is_macos and host.is_arm64
)
if _enable_gpu_layers:
command.extend(["--n-gpu-layers", "1"])
log_fd, log_name = tempfile.mkstemp(prefix = "llama-server-", suffix = ".log")
@ -4985,6 +5006,7 @@ def validate_prebuilt_choice(
host,
install_dir,
runtime_line = choice.runtime_line,
install_kind = choice.install_kind,
)
log(f"staged prebuilt validation succeeded for {choice.name}")
return server_path, quantize_path

View file

@ -185,9 +185,9 @@ def _ensure_rocm_torch() -> None:
print(" ROCm detected but version unreadable -- skipping torch reinstall")
return
# Skip if torch already links against HIP (ROCm is already working).
# Do NOT skip for CUDA-only builds since they are unusable on AMD-only hosts
# (the NVIDIA check above already handled mixed AMD+NVIDIA setups).
# Probe whether torch already links against HIP (ROCm is already working).
# Do NOT skip for CUDA-only builds since they are unusable on AMD-only
# hosts (the NVIDIA check above already handled mixed AMD+NVIDIA setups).
try:
probe = subprocess.run(
[
@ -201,36 +201,43 @@ def _ensure_rocm_torch() -> None:
)
except (OSError, subprocess.TimeoutExpired):
probe = None
if probe is not None and probe.returncode == 0 and probe.stdout.decode().strip():
return # torch already has HIP/ROCm backend
# Select best matching wheel tag (newest ROCm version <= installed)
tag = next(
(
t
for (maj, mn), t in sorted(_ROCM_TORCH_INDEX.items(), reverse = True)
if ver >= (maj, mn)
),
None,
has_hip_torch = (
probe is not None
and probe.returncode == 0
and probe.stdout.decode().strip() != ""
)
if tag is None:
print(f" No PyTorch wheel for ROCm {ver[0]}.{ver[1]} -- skipping")
return
index_url = f"{_PYTORCH_WHL_BASE}/{tag}"
print(f" ROCm {ver[0]}.{ver[1]} -- installing torch from {index_url}")
pip_install(
f"ROCm torch ({tag})",
"--force-reinstall",
"--no-cache-dir",
"torch>=2.4,<2.11.0",
"torchvision<0.26.0",
"torchaudio<2.11.0",
"--index-url",
index_url,
constrain = False,
)
# Also install bitsandbytes for AMD
if not has_hip_torch:
# Select best matching wheel tag (newest ROCm version <= installed)
tag = next(
(
t
for (maj, mn), t in sorted(_ROCM_TORCH_INDEX.items(), reverse = True)
if ver >= (maj, mn)
),
None,
)
if tag is None:
print(f" No PyTorch wheel for ROCm {ver[0]}.{ver[1]} -- skipping")
return
index_url = f"{_PYTORCH_WHL_BASE}/{tag}"
print(f" ROCm {ver[0]}.{ver[1]} -- installing torch from {index_url}")
pip_install(
f"ROCm torch ({tag})",
"--force-reinstall",
"--no-cache-dir",
"torch>=2.4,<2.11.0",
"torchvision<0.26.0",
"torchaudio<2.11.0",
"--index-url",
index_url,
constrain = False,
)
# Always install bitsandbytes for AMD -- runs even when torch was not
# reinstalled (e.g. "unsloth studio update" on a venv that already has
# ROCm torch) so the AMD bitsandbytes dependency is not left missing.
pip_install(
"bitsandbytes (AMD)",
"--no-cache-dir",