mirror of
https://github.com/unslothai/unsloth
synced 2026-04-21 13:37:39 +00:00
Fix round 2 regressions: ROCm validate_server and Windows HIP routing
Follow-up to 810b833b addressing review findings on the first round of
hardening commits:
- install_llama_prebuilt.py validate_server: gate --n-gpu-layers on the
resolved install_kind instead of host.has_rocm. AMD Windows hosts
without a HIP prebuilt fall back to windows-cpu and must not be
validated with GPU layers; thread install_kind through from the
caller.
- install_llama_prebuilt.py resolve_release_asset_choice: reinstate the
"not has_rocm" guard on the published windows-cpu bundle so AMD
Windows hosts reach resolve_upstream_asset_choice() where the new
HIP prebuilt path lives. Prefer a published windows-hip bundle first
when one exists, fall through to upstream HIP + upstream CPU
otherwise.
- install_llama_prebuilt.py detect_host: also set has_physical_nvidia
when the secondary --query-gpu block confirms a working NVIDIA GPU,
so older nvidia-smi versions without -L support do not silently skip
the Linux diagnostics that key off has_physical_nvidia.
- install_llama_prebuilt.py: drop redundant "import re as _re" /
"import re as _re_rocm" local aliases in favour of the existing
top-level "import re".
- install_python_stack.py _ensure_rocm_torch: run the AMD
bitsandbytes install even when the HIP-torch probe finds an existing
ROCm torch (previously that case returned early and skipped it), so
"unsloth studio update" on venvs that already have ROCm torch still
gains the AMD bitsandbytes build.
- install.sh: add a non-x86_64 early-exit to get_torch_index_url() so
aarch64 / arm64 Linux hosts do not hit the ROCm wheel index
(PyTorch only publishes ROCm wheels for linux_x86_64).
- install.sh: add bitsandbytes install to the migrated-environment
branch so upgrades pick it up for ROCm hosts instead of only the
fresh-install path.
- install.sh: in the Radeon wheel path, pass version constraints +
--no-index --find-links to uv instead of explicit wheel URLs so a
version-compatible torch / torchvision / torchaudio triple is
resolved, rather than picking the highest-version wheel for each
package independently.
- studio/backend/utils/hardware/amd.py _first_visible_amd_gpu_id: fall
through to lower-priority visibility env vars when the first entry
is malformed (leading comma, all-whitespace first token) instead of
silently returning GPU 0.
This commit is contained in:
parent
810b833b01
commit
8636fa63fc
4 changed files with 119 additions and 54 deletions
39
install.sh
39
install.sh
|
|
@ -1030,7 +1030,14 @@ get_torch_index_url() {
|
|||
fi
|
||||
fi
|
||||
if [ -z "$_smi" ]; then
|
||||
# No NVIDIA GPU -- check for AMD ROCm GPU
|
||||
# No NVIDIA GPU -- check for AMD ROCm GPU.
|
||||
# PyTorch only publishes ROCm wheels for linux-x86_64; skip the
|
||||
# ROCm branch entirely on aarch64 / arm64 / other architectures
|
||||
# so non-x86_64 Linux hosts fall back cleanly to CPU wheels.
|
||||
case "$(uname -m)" in
|
||||
x86_64|amd64) : ;;
|
||||
*) echo "$_base/cpu"; return ;;
|
||||
esac
|
||||
if ! _has_amd_rocm_gpu; then
|
||||
echo "$_base/cpu"; return
|
||||
fi
|
||||
|
|
@ -1241,6 +1248,17 @@ if [ "$_MIGRATED" = true ]; then
|
|||
substep "overlaying local repo (editable)..."
|
||||
run_install_cmd "overlay local repo" uv pip install --python "$_VENV_PY" -e "$_REPO_ROOT" --no-deps
|
||||
fi
|
||||
# AMD ROCm: install bitsandbytes even in migrated environments so
|
||||
# existing ROCm installs gain the AMD bitsandbytes build without a
|
||||
# fresh reinstall.
|
||||
if [ "$SKIP_TORCH" = false ]; then
|
||||
case "$TORCH_INDEX_URL" in
|
||||
*/rocm*)
|
||||
substep "installing bitsandbytes for AMD ROCm..."
|
||||
run_install_cmd "install bitsandbytes (AMD)" uv pip install --python "$_VENV_PY" "bitsandbytes>=0.49.1"
|
||||
;;
|
||||
esac
|
||||
fi
|
||||
elif [ -n "$TORCH_INDEX_URL" ]; then
|
||||
# Fresh: Step 1 - install torch from explicit index (skip when --no-torch or Intel Mac)
|
||||
if [ "$SKIP_TORCH" = true ]; then
|
||||
|
|
@ -1277,14 +1295,29 @@ elif [ -n "$TORCH_INDEX_URL" ]; then
|
|||
--index-url "$TORCH_INDEX_URL"
|
||||
else
|
||||
substep "installing PyTorch from Radeon repo (${_RADEON_BASE_URL})..."
|
||||
# Use version constraints + --find-links + --no-index so
|
||||
# uv resolves a compatible torch / torchvision / torchaudio
|
||||
# set from the Radeon listing (instead of picking the
|
||||
# highest-version wheel for each package independently,
|
||||
# which can assemble a version-mismatched stack).
|
||||
# The wheel presence check above guarantees the listing
|
||||
# has at least one wheel per package; uv will pick the
|
||||
# newest version-compatible triple.
|
||||
if [ -n "$_tri_whl" ]; then
|
||||
run_install_cmd "install triton + PyTorch" uv pip install --python "$_VENV_PY" \
|
||||
--no-index \
|
||||
--find-links "$_RADEON_BASE_URL" \
|
||||
"$_tri_whl" "$_torch_whl" "$_tv_whl" "$_ta_whl"
|
||||
"$TORCH_CONSTRAINT" \
|
||||
"torchvision<0.26.0" \
|
||||
"torchaudio<2.11.0" \
|
||||
"triton<3.7"
|
||||
else
|
||||
run_install_cmd "install PyTorch" uv pip install --python "$_VENV_PY" \
|
||||
--no-index \
|
||||
--find-links "$_RADEON_BASE_URL" \
|
||||
"$_torch_whl" "$_tv_whl" "$_ta_whl"
|
||||
"$TORCH_CONSTRAINT" \
|
||||
"torchvision<0.26.0" \
|
||||
"torchaudio<2.11.0"
|
||||
fi
|
||||
fi
|
||||
else
|
||||
|
|
|
|||
|
|
@ -220,7 +220,10 @@ def _first_visible_amd_gpu_id() -> Optional[str]:
|
|||
first = raw.split(",", 1)[0].strip()
|
||||
if first:
|
||||
return first
|
||||
break
|
||||
# Leading comma or all-whitespace first token -- fall through to
|
||||
# the next env var in priority order rather than silently
|
||||
# returning GPU 0.
|
||||
continue
|
||||
return "0"
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -2554,6 +2554,12 @@ def detect_host() -> HostInfo:
|
|||
|
||||
if visible_gpu_rows:
|
||||
has_usable_nvidia = True
|
||||
# Older nvidia-smi versions (pre -L support) hit the
|
||||
# except in the first try block but still succeed here,
|
||||
# leaving has_physical_nvidia unset. Mirror the -L path
|
||||
# so downstream diagnostics that key off has_physical_nvidia still run.
|
||||
if not has_physical_nvidia:
|
||||
has_physical_nvidia = True
|
||||
elif visible_device_tokens == []:
|
||||
has_usable_nvidia = False
|
||||
elif supports_explicit_visible_device_matching(visible_device_tokens):
|
||||
|
|
@ -2564,17 +2570,16 @@ def detect_host() -> HostInfo:
|
|||
pass
|
||||
|
||||
# Detect AMD ROCm (HIP) -- require actual GPU, not just tools installed
|
||||
import re as _re
|
||||
|
||||
def _amd_smi_has_gpu(stdout: str) -> bool:
|
||||
"""Check for 'GPU: <number>' data rows, not just a table header."""
|
||||
return bool(_re.search(r"(?im)^gpu\s*[:\[]\s*\d", stdout))
|
||||
return bool(re.search(r"(?im)^gpu\s*[:\[]\s*\d", stdout))
|
||||
|
||||
has_rocm = False
|
||||
if is_linux:
|
||||
for _cmd, _check in (
|
||||
# rocminfo: look for "gfxNNNN" with nonzero first digit (gfx000 is CPU agent)
|
||||
(["rocminfo"], lambda out: bool(_re.search(r"gfx[1-9]", out.lower()))),
|
||||
(["rocminfo"], lambda out: bool(re.search(r"gfx[1-9]", out.lower()))),
|
||||
(["amd-smi", "list"], _amd_smi_has_gpu),
|
||||
):
|
||||
_exe = shutil.which(_cmd[0])
|
||||
|
|
@ -3000,10 +3005,8 @@ def resolve_upstream_asset_choice(host: HostInfo, llama_tag: str) -> AssetChoice
|
|||
# the newest one. Hardcoding a single rocm-7.2 filename means
|
||||
# ROCm 6.x / 7.0 / 7.1 / 7.3+ users always fall through to a
|
||||
# source build even when a matching prebuilt exists upstream.
|
||||
import re as _re_rocm
|
||||
|
||||
_rocm_pattern = _re_rocm.compile(
|
||||
rf"llama-{_re_rocm.escape(llama_tag)}-bin-ubuntu-rocm-([0-9]+(?:\.[0-9]+)*)-x64\.tar\.gz"
|
||||
_rocm_pattern = re.compile(
|
||||
rf"llama-{re.escape(llama_tag)}-bin-ubuntu-rocm-([0-9]+(?:\.[0-9]+)*)-x64\.tar\.gz"
|
||||
)
|
||||
rocm_candidates: list[tuple[tuple[int, ...], str]] = []
|
||||
for _name in upstream_assets:
|
||||
|
|
@ -3153,13 +3156,20 @@ def resolve_release_asset_choice(
|
|||
|
||||
published_choice: AssetChoice | None = None
|
||||
if host.is_windows and host.is_x86_64:
|
||||
# Always try the published Windows CPU bundle, even on AMD ROCm
|
||||
# hosts. If a windows-hip bundle is added to published releases
|
||||
# in the future, the upstream resolver below would pick it first
|
||||
# via resolve_asset_choice; falling back to the hash-approved
|
||||
# windows-cpu bundle is still better than the upstream CPU
|
||||
# asset for AMD Windows hosts without a HIP prebuilt.
|
||||
published_choice = published_asset_choice_for_kind(release, "windows-cpu")
|
||||
# AMD Windows hosts should prefer a hash-approved published
|
||||
# Windows HIP bundle when one exists, but otherwise fall through
|
||||
# to resolve_asset_choice() so the upstream HIP prebuilt is
|
||||
# tried before the CPU fallback. Hard-pinning the published
|
||||
# windows-cpu bundle here would make the new HIP path
|
||||
# unreachable.
|
||||
if host.has_rocm:
|
||||
published_choice = published_asset_choice_for_kind(
|
||||
release, "windows-hip"
|
||||
)
|
||||
else:
|
||||
published_choice = published_asset_choice_for_kind(
|
||||
release, "windows-cpu"
|
||||
)
|
||||
elif host.is_macos and host.is_arm64:
|
||||
published_choice = published_asset_choice_for_kind(release, "macos-arm64")
|
||||
elif host.is_macos and host.is_x86_64:
|
||||
|
|
@ -4248,6 +4258,7 @@ def validate_server(
|
|||
install_dir: Path,
|
||||
*,
|
||||
runtime_line: str | None = None,
|
||||
install_kind: str | None = None,
|
||||
) -> None:
|
||||
last_failure: PrebuiltFallback | None = None
|
||||
for port_attempt in range(1, SERVER_PORT_BIND_ATTEMPTS + 1):
|
||||
|
|
@ -4271,11 +4282,21 @@ def validate_server(
|
|||
"--batch-size",
|
||||
"32",
|
||||
]
|
||||
if (
|
||||
host.has_usable_nvidia
|
||||
or host.has_rocm
|
||||
or (host.is_macos and host.is_arm64)
|
||||
):
|
||||
# Only enable GPU offload for assets that actually ship GPU code.
|
||||
# Gating on `host.has_rocm` alone breaks the intentional CPU
|
||||
# fallback on AMD Windows hosts without a HIP prebuilt: the CPU
|
||||
# binary would be launched with `--n-gpu-layers 1` and fail
|
||||
# validation. Use the resolved install_kind as the source of
|
||||
# truth and fall back to host detection when the caller did not
|
||||
# pass one (keeps backwards compatibility with older call sites).
|
||||
_gpu_kinds = {"linux-cuda", "linux-rocm", "windows-cuda", "windows-hip", "macos-arm64"}
|
||||
if install_kind is not None:
|
||||
_enable_gpu_layers = install_kind in _gpu_kinds
|
||||
else:
|
||||
_enable_gpu_layers = host.has_usable_nvidia or (
|
||||
host.is_macos and host.is_arm64
|
||||
)
|
||||
if _enable_gpu_layers:
|
||||
command.extend(["--n-gpu-layers", "1"])
|
||||
|
||||
log_fd, log_name = tempfile.mkstemp(prefix = "llama-server-", suffix = ".log")
|
||||
|
|
@ -4985,6 +5006,7 @@ def validate_prebuilt_choice(
|
|||
host,
|
||||
install_dir,
|
||||
runtime_line = choice.runtime_line,
|
||||
install_kind = choice.install_kind,
|
||||
)
|
||||
log(f"staged prebuilt validation succeeded for {choice.name}")
|
||||
return server_path, quantize_path
|
||||
|
|
|
|||
|
|
@ -185,9 +185,9 @@ def _ensure_rocm_torch() -> None:
|
|||
print(" ROCm detected but version unreadable -- skipping torch reinstall")
|
||||
return
|
||||
|
||||
# Skip if torch already links against HIP (ROCm is already working).
|
||||
# Do NOT skip for CUDA-only builds since they are unusable on AMD-only hosts
|
||||
# (the NVIDIA check above already handled mixed AMD+NVIDIA setups).
|
||||
# Probe whether torch already links against HIP (ROCm is already working).
|
||||
# Do NOT skip for CUDA-only builds since they are unusable on AMD-only
|
||||
# hosts (the NVIDIA check above already handled mixed AMD+NVIDIA setups).
|
||||
try:
|
||||
probe = subprocess.run(
|
||||
[
|
||||
|
|
@ -201,36 +201,43 @@ def _ensure_rocm_torch() -> None:
|
|||
)
|
||||
except (OSError, subprocess.TimeoutExpired):
|
||||
probe = None
|
||||
if probe is not None and probe.returncode == 0 and probe.stdout.decode().strip():
|
||||
return # torch already has HIP/ROCm backend
|
||||
|
||||
# Select best matching wheel tag (newest ROCm version <= installed)
|
||||
tag = next(
|
||||
(
|
||||
t
|
||||
for (maj, mn), t in sorted(_ROCM_TORCH_INDEX.items(), reverse = True)
|
||||
if ver >= (maj, mn)
|
||||
),
|
||||
None,
|
||||
has_hip_torch = (
|
||||
probe is not None
|
||||
and probe.returncode == 0
|
||||
and probe.stdout.decode().strip() != ""
|
||||
)
|
||||
if tag is None:
|
||||
print(f" No PyTorch wheel for ROCm {ver[0]}.{ver[1]} -- skipping")
|
||||
return
|
||||
|
||||
index_url = f"{_PYTORCH_WHL_BASE}/{tag}"
|
||||
print(f" ROCm {ver[0]}.{ver[1]} -- installing torch from {index_url}")
|
||||
pip_install(
|
||||
f"ROCm torch ({tag})",
|
||||
"--force-reinstall",
|
||||
"--no-cache-dir",
|
||||
"torch>=2.4,<2.11.0",
|
||||
"torchvision<0.26.0",
|
||||
"torchaudio<2.11.0",
|
||||
"--index-url",
|
||||
index_url,
|
||||
constrain = False,
|
||||
)
|
||||
# Also install bitsandbytes for AMD
|
||||
if not has_hip_torch:
|
||||
# Select best matching wheel tag (newest ROCm version <= installed)
|
||||
tag = next(
|
||||
(
|
||||
t
|
||||
for (maj, mn), t in sorted(_ROCM_TORCH_INDEX.items(), reverse = True)
|
||||
if ver >= (maj, mn)
|
||||
),
|
||||
None,
|
||||
)
|
||||
if tag is None:
|
||||
print(f" No PyTorch wheel for ROCm {ver[0]}.{ver[1]} -- skipping")
|
||||
return
|
||||
|
||||
index_url = f"{_PYTORCH_WHL_BASE}/{tag}"
|
||||
print(f" ROCm {ver[0]}.{ver[1]} -- installing torch from {index_url}")
|
||||
pip_install(
|
||||
f"ROCm torch ({tag})",
|
||||
"--force-reinstall",
|
||||
"--no-cache-dir",
|
||||
"torch>=2.4,<2.11.0",
|
||||
"torchvision<0.26.0",
|
||||
"torchaudio<2.11.0",
|
||||
"--index-url",
|
||||
index_url,
|
||||
constrain = False,
|
||||
)
|
||||
|
||||
# Always install bitsandbytes for AMD -- runs even when torch was not
|
||||
# reinstalled (e.g. "unsloth studio update" on a venv that already has
|
||||
# ROCm torch) so the AMD bitsandbytes dependency is not left missing.
|
||||
pip_install(
|
||||
"bitsandbytes (AMD)",
|
||||
"--no-cache-dir",
|
||||
|
|
|
|||
Loading…
Reference in a new issue