Fix round 2 regressions: ROCm validate_server and Windows HIP routing

Follow-up to 810b833b addressing review findings on the first round of
hardening commits:

- install_llama_prebuilt.py validate_server: gate --n-gpu-layers on the
  resolved install_kind instead of host.has_rocm. AMD Windows hosts
  without a HIP prebuilt fall back to windows-cpu and must not be
  validated with GPU layers; thread install_kind through from the
  caller.
- install_llama_prebuilt.py resolve_release_asset_choice: reinstate the
  "not has_rocm" guard on the published windows-cpu bundle so AMD
  Windows hosts reach resolve_upstream_asset_choice() where the new
  HIP prebuilt path lives. Prefer a published windows-hip bundle first
  when one exists, fall through to upstream HIP + upstream CPU
  otherwise.
- install_llama_prebuilt.py detect_host: also set has_physical_nvidia
  when the secondary --query-gpu block confirms a working NVIDIA GPU,
  so older nvidia-smi versions without -L support do not silently skip
  the Linux diagnostics that key off has_physical_nvidia.
- install_llama_prebuilt.py: drop redundant "import re as _re" /
  "import re as _re_rocm" local aliases in favour of the existing
  top-level "import re".
- install_python_stack.py _ensure_rocm_torch: run the AMD
  bitsandbytes install unconditionally after the HIP-torch probe, so
  that "unsloth studio update" on a venv that already has ROCm torch
  still picks up the AMD bitsandbytes build.
- install.sh: add a non-x86_64 early-exit to get_torch_index_url() so
  aarch64 / arm64 Linux hosts do not hit the ROCm wheel index
  (PyTorch only publishes ROCm wheels for linux_x86_64).
- install.sh: add bitsandbytes install to the migrated-environment
  branch so upgrades pick it up for ROCm hosts instead of only the
  fresh-install path.
- install.sh: in the Radeon wheel path, pass version constraints +
  --no-index --find-links to uv instead of explicit wheel URLs so a
  version-compatible torch / torchvision / torchaudio triple is
  resolved, rather than picking the highest-version wheel for each
  package independently.
- studio/backend/utils/hardware/amd.py _first_visible_amd_gpu_id: fall
  through to lower-priority visibility env vars when the first entry
  is malformed (leading comma, all-whitespace first token) instead of
  silently returning GPU 0.

This commit is contained in:
Daniel Han 2026-04-08 09:46:41 +00:00
parent 810b833b01
commit 8636fa63fc
4 changed files with 119 additions and 54 deletions

View file

@ -1030,7 +1030,14 @@ get_torch_index_url() {
fi
fi
if [ -z "$_smi" ]; then
# No NVIDIA GPU -- check for AMD ROCm GPU
# No NVIDIA GPU -- check for AMD ROCm GPU.
# PyTorch only publishes ROCm wheels for linux-x86_64; skip the
# ROCm branch entirely on aarch64 / arm64 / other architectures
# so non-x86_64 Linux hosts fall back cleanly to CPU wheels.
case "$(uname -m)" in
x86_64|amd64) : ;;
*) echo "$_base/cpu"; return ;;
esac
if ! _has_amd_rocm_gpu; then
echo "$_base/cpu"; return
fi
@ -1241,6 +1248,17 @@ if [ "$_MIGRATED" = true ]; then
substep "overlaying local repo (editable)..."
run_install_cmd "overlay local repo" uv pip install --python "$_VENV_PY" -e "$_REPO_ROOT" --no-deps
fi
# AMD ROCm: install bitsandbytes even in migrated environments so
# existing ROCm installs gain the AMD bitsandbytes build without a
# fresh reinstall.
if [ "$SKIP_TORCH" = false ]; then
case "$TORCH_INDEX_URL" in
*/rocm*)
substep "installing bitsandbytes for AMD ROCm..."
run_install_cmd "install bitsandbytes (AMD)" uv pip install --python "$_VENV_PY" "bitsandbytes>=0.49.1"
;;
esac
fi
elif [ -n "$TORCH_INDEX_URL" ]; then
# Fresh: Step 1 - install torch from explicit index (skip when --no-torch or Intel Mac)
if [ "$SKIP_TORCH" = true ]; then
@ -1277,14 +1295,29 @@ elif [ -n "$TORCH_INDEX_URL" ]; then
--index-url "$TORCH_INDEX_URL"
else
substep "installing PyTorch from Radeon repo (${_RADEON_BASE_URL})..."
# Use version constraints + --find-links + --no-index so
# uv resolves a compatible torch / torchvision / torchaudio
# set from the Radeon listing (instead of picking the
# highest-version wheel for each package independently,
# which can assemble a version-mismatched stack).
# The wheel presence check above guarantees the listing
# has at least one wheel per package; uv will pick the
# newest version-compatible triple.
if [ -n "$_tri_whl" ]; then
run_install_cmd "install triton + PyTorch" uv pip install --python "$_VENV_PY" \
--no-index \
--find-links "$_RADEON_BASE_URL" \
"$_tri_whl" "$_torch_whl" "$_tv_whl" "$_ta_whl"
"$TORCH_CONSTRAINT" \
"torchvision<0.26.0" \
"torchaudio<2.11.0" \
"triton<3.7"
else
run_install_cmd "install PyTorch" uv pip install --python "$_VENV_PY" \
--no-index \
--find-links "$_RADEON_BASE_URL" \
"$_torch_whl" "$_tv_whl" "$_ta_whl"
"$TORCH_CONSTRAINT" \
"torchvision<0.26.0" \
"torchaudio<2.11.0"
fi
fi
else

View file

@ -220,7 +220,10 @@ def _first_visible_amd_gpu_id() -> Optional[str]:
first = raw.split(",", 1)[0].strip()
if first:
return first
break
# Leading comma or all-whitespace first token -- fall through to
# the next env var in priority order rather than silently
# returning GPU 0.
continue
return "0"

View file

@ -2554,6 +2554,12 @@ def detect_host() -> HostInfo:
if visible_gpu_rows:
has_usable_nvidia = True
# Older nvidia-smi versions (pre -L support) hit the
# except in the first try block but still succeed here,
# leaving has_physical_nvidia unset. Mirror the -L path
# so downstream diagnostics on line ~4390 still run.
if not has_physical_nvidia:
has_physical_nvidia = True
elif visible_device_tokens == []:
has_usable_nvidia = False
elif supports_explicit_visible_device_matching(visible_device_tokens):
@ -2564,17 +2570,16 @@ def detect_host() -> HostInfo:
pass
# Detect AMD ROCm (HIP) -- require actual GPU, not just tools installed
import re as _re
def _amd_smi_has_gpu(stdout: str) -> bool:
"""Check for 'GPU: <number>' data rows, not just a table header."""
return bool(_re.search(r"(?im)^gpu\s*[:\[]\s*\d", stdout))
return bool(re.search(r"(?im)^gpu\s*[:\[]\s*\d", stdout))
has_rocm = False
if is_linux:
for _cmd, _check in (
# rocminfo: look for "gfxNNNN" with nonzero first digit (gfx000 is CPU agent)
(["rocminfo"], lambda out: bool(_re.search(r"gfx[1-9]", out.lower()))),
(["rocminfo"], lambda out: bool(re.search(r"gfx[1-9]", out.lower()))),
(["amd-smi", "list"], _amd_smi_has_gpu),
):
_exe = shutil.which(_cmd[0])
@ -3000,10 +3005,8 @@ def resolve_upstream_asset_choice(host: HostInfo, llama_tag: str) -> AssetChoice
# the newest one. Hardcoding a single rocm-7.2 filename means
# ROCm 6.x / 7.0 / 7.1 / 7.3+ users always fall through to a
# source build even when a matching prebuilt exists upstream.
import re as _re_rocm
_rocm_pattern = _re_rocm.compile(
rf"llama-{_re_rocm.escape(llama_tag)}-bin-ubuntu-rocm-([0-9]+(?:\.[0-9]+)*)-x64\.tar\.gz"
_rocm_pattern = re.compile(
rf"llama-{re.escape(llama_tag)}-bin-ubuntu-rocm-([0-9]+(?:\.[0-9]+)*)-x64\.tar\.gz"
)
rocm_candidates: list[tuple[tuple[int, ...], str]] = []
for _name in upstream_assets:
@ -3153,13 +3156,20 @@ def resolve_release_asset_choice(
published_choice: AssetChoice | None = None
if host.is_windows and host.is_x86_64:
# Always try the published Windows CPU bundle, even on AMD ROCm
# hosts. If a windows-hip bundle is added to published releases
# in the future, the upstream resolver below would pick it first
# via resolve_asset_choice; falling back to the hash-approved
# windows-cpu bundle is still better than the upstream CPU
# asset for AMD Windows hosts without a HIP prebuilt.
published_choice = published_asset_choice_for_kind(release, "windows-cpu")
# AMD Windows hosts should prefer a hash-approved published
# Windows HIP bundle when one exists, but otherwise fall through
# to resolve_asset_choice() so the upstream HIP prebuilt is
# tried before the CPU fallback. Hard-pinning the published
# windows-cpu bundle here would make the new HIP path
# unreachable.
if host.has_rocm:
published_choice = published_asset_choice_for_kind(
release, "windows-hip"
)
else:
published_choice = published_asset_choice_for_kind(
release, "windows-cpu"
)
elif host.is_macos and host.is_arm64:
published_choice = published_asset_choice_for_kind(release, "macos-arm64")
elif host.is_macos and host.is_x86_64:
@ -4248,6 +4258,7 @@ def validate_server(
install_dir: Path,
*,
runtime_line: str | None = None,
install_kind: str | None = None,
) -> None:
last_failure: PrebuiltFallback | None = None
for port_attempt in range(1, SERVER_PORT_BIND_ATTEMPTS + 1):
@ -4271,11 +4282,21 @@ def validate_server(
"--batch-size",
"32",
]
if (
host.has_usable_nvidia
or host.has_rocm
or (host.is_macos and host.is_arm64)
):
# Only enable GPU offload for assets that actually ship GPU code.
# Gating on `host.has_rocm` alone breaks the intentional CPU
# fallback on AMD Windows hosts without a HIP prebuilt: the CPU
# binary would be launched with `--n-gpu-layers 1` and fail
# validation. Use the resolved install_kind as the source of
# truth and fall back to host detection when the caller did not
# pass one (keeps backwards compatibility with older call sites).
_gpu_kinds = {"linux-cuda", "linux-rocm", "windows-cuda", "windows-hip", "macos-arm64"}
if install_kind is not None:
_enable_gpu_layers = install_kind in _gpu_kinds
else:
_enable_gpu_layers = host.has_usable_nvidia or (
host.is_macos and host.is_arm64
)
if _enable_gpu_layers:
command.extend(["--n-gpu-layers", "1"])
log_fd, log_name = tempfile.mkstemp(prefix = "llama-server-", suffix = ".log")
@ -4985,6 +5006,7 @@ def validate_prebuilt_choice(
host,
install_dir,
runtime_line = choice.runtime_line,
install_kind = choice.install_kind,
)
log(f"staged prebuilt validation succeeded for {choice.name}")
return server_path, quantize_path

View file

@ -185,9 +185,9 @@ def _ensure_rocm_torch() -> None:
print(" ROCm detected but version unreadable -- skipping torch reinstall")
return
# Skip if torch already links against HIP (ROCm is already working).
# Do NOT skip for CUDA-only builds since they are unusable on AMD-only hosts
# (the NVIDIA check above already handled mixed AMD+NVIDIA setups).
# Probe whether torch already links against HIP (ROCm is already working).
# Do NOT skip for CUDA-only builds since they are unusable on AMD-only
# hosts (the NVIDIA check above already handled mixed AMD+NVIDIA setups).
try:
probe = subprocess.run(
[
@ -201,36 +201,43 @@ def _ensure_rocm_torch() -> None:
)
except (OSError, subprocess.TimeoutExpired):
probe = None
if probe is not None and probe.returncode == 0 and probe.stdout.decode().strip():
return # torch already has HIP/ROCm backend
# Select best matching wheel tag (newest ROCm version <= installed)
tag = next(
(
t
for (maj, mn), t in sorted(_ROCM_TORCH_INDEX.items(), reverse = True)
if ver >= (maj, mn)
),
None,
has_hip_torch = (
probe is not None
and probe.returncode == 0
and probe.stdout.decode().strip() != ""
)
if tag is None:
print(f" No PyTorch wheel for ROCm {ver[0]}.{ver[1]} -- skipping")
return
index_url = f"{_PYTORCH_WHL_BASE}/{tag}"
print(f" ROCm {ver[0]}.{ver[1]} -- installing torch from {index_url}")
pip_install(
f"ROCm torch ({tag})",
"--force-reinstall",
"--no-cache-dir",
"torch>=2.4,<2.11.0",
"torchvision<0.26.0",
"torchaudio<2.11.0",
"--index-url",
index_url,
constrain = False,
)
# Also install bitsandbytes for AMD
if not has_hip_torch:
# Select best matching wheel tag (newest ROCm version <= installed)
tag = next(
(
t
for (maj, mn), t in sorted(_ROCM_TORCH_INDEX.items(), reverse = True)
if ver >= (maj, mn)
),
None,
)
if tag is None:
print(f" No PyTorch wheel for ROCm {ver[0]}.{ver[1]} -- skipping")
return
index_url = f"{_PYTORCH_WHL_BASE}/{tag}"
print(f" ROCm {ver[0]}.{ver[1]} -- installing torch from {index_url}")
pip_install(
f"ROCm torch ({tag})",
"--force-reinstall",
"--no-cache-dir",
"torch>=2.4,<2.11.0",
"torchvision<0.26.0",
"torchaudio<2.11.0",
"--index-url",
index_url,
constrain = False,
)
# Always install bitsandbytes for AMD -- runs even when torch was not
# reinstalled (e.g. "unsloth studio update" on a venv that already has
# ROCm torch) so the AMD bitsandbytes dependency is not left missing.
pip_install(
"bitsandbytes (AMD)",
"--no-cache-dir",