mirror of
https://github.com/unslothai/unsloth
synced 2026-04-21 13:37:39 +00:00
Fix gemini round 6 + URL audit: amd.py defensive checks, rocm6.5+ clip to 6.4
Two rounds of fixes in one commit, plus a full URL audit of every PyPI /
download.pytorch.org / repo.radeon.com reference that the PR introduces.
amd.py (4 medium gemini findings on commit b3627bc2):
1. _extract_gpu_metrics used `and vram_total_mb` as part of the vram_util
   gate. The follow-up `vram_total_mb > 0` already handles the division
   guard, so the truthiness check was redundant -- and slightly surprising,
   since 0.0 is a valid value that truthiness would silently reject. Replace
   with an explicit `is not None and > 0` for both vram_util and power_util.
2. get_physical_gpu_count called `data.get("gpu", ...)` without guarding
for non-dict envelopes. A scalar / string JSON response from amd-smi
would raise AttributeError. Add an isinstance(data, dict) check and
return None for unexpected shapes.
3. get_visible_gpu_utilization had the same .get() exposure on the outer
envelope. Rewrite the gpu_list extraction as an explicit
list/dict/else cascade so a malformed scalar envelope produces
gpu_list=[data] and continues without raising.
4. The same function's per-entry loop also called gpu_data.get() on
   whatever was inside gpu_list. If a scalar ever leaks into the list
   (either directly from amd-smi output or via the [data] fallback added
   in fix 3), _extract_gpu_metrics would raise on its first .get() call.
   Skip non-dict entries in the loop before extracting metrics.
install.sh (URL audit finding, previously flagged by 20-reviewer as #13):
5. get_torch_index_url used `rocm6.*` in the rocm tag case statement,
which matched rocm6.5 and rocm6.6 and emitted
download.pytorch.org/whl/rocm6.5 -- which returns HTTP 403 because
PyTorch only publishes rocm 5.7, 6.0-6.4, 7.0-7.2. Enumerate the
supported 6.x minors explicitly and add a rocm6.* fallback branch
that clips to rocm6.4 (the last supported 6.x wheel set).
URL audit results (all URLs PR 4720 references):
- 14/14 download.pytorch.org/whl/{cpu,cu118,cu124,cu126,cu128,cu130,
rocm6.0..6.4,rocm7.0..7.2} return HTTP 200.
- 9/9 repo.radeon.com/rocm/manylinux/rocm-rel-{5.7,6.0,6.1,6.2,6.3,
6.4,7.0,7.1,7.2}/ return HTTP 200.
- X.Y.Z patch directories exist for 7.0.2, 7.1.1, 7.2.1 but NOT for
6.3.0, 6.4.0, 6.2.1 -- install.sh already handles this via the X.Y.Z
-> X.Y fallback sed in the Radeon wheel install block.
- Docs links (rocm.docs.amd.com, docs.unsloth.ai AMD guide) and the
llama.cpp GitHub releases API endpoint all return 200.
Test suite: 255 -> 258. New regression coverage:
- U17: get_physical_gpu_count tolerates scalar amd-smi envelope
- U18: get_visible_gpu_utilization tolerates scalar envelope
- U19a-c: vram_util / power_util return None on zero total, but
vram_total_gb still echoes 0.0 (not None)
- A_rocm{6.5,6.6,6.9}_clips_to_rocm64: install.sh clips unsupported
6.x minors to rocm6.4 instead of producing a 403 index URL
This commit is contained in:
parent
1d387d6746
commit
7effb3aee8
2 changed files with 45 additions and 17 deletions
20
install.sh
20
install.sh
|
|
@ -1068,13 +1068,23 @@ get_torch_index_url() {
|
|||
case "$_rocm_tag" in
|
||||
rocm[1-5].*) echo "$_base/cpu"; return ;;
|
||||
esac
|
||||
# ROCm 7.2 only has torch 2.11.0 which exceeds current bounds (<2.11.0).
|
||||
# Fall back to rocm7.1 index which has torch 2.10.0.
|
||||
# TODO: uncomment the next line when torch upper bound is bumped to >=2.11.0
|
||||
# echo "$_base/$_rocm_tag"; return
|
||||
# ROCm 7.2 only has torch 2.11.0 which exceeds current bounds
|
||||
# (<2.11.0). Fall back to rocm7.1 index which has torch 2.10.0.
|
||||
# Enumerate explicit versions rather than matching rocm6.* so
|
||||
# a host on ROCm 6.5 or 6.6 (no PyTorch wheels published) is
|
||||
# clipped down to the last supported 6.x (rocm6.4) instead of
|
||||
# constructing https://download.pytorch.org/whl/rocm6.5 which
|
||||
# returns HTTP 403. PyTorch only ships: rocm5.7, 6.0, 6.1, 6.2,
|
||||
# 6.3, 6.4, 7.0, 7.1, 7.2 (and 5.7 is below our minimum).
|
||||
# TODO: uncomment rocm7.2 when the torch upper bound is bumped
|
||||
# to >=2.11.0.
|
||||
case "$_rocm_tag" in
|
||||
rocm6.*|rocm7.0|rocm7.0.*|rocm7.1|rocm7.1.*)
|
||||
rocm6.0|rocm6.0.*|rocm6.1|rocm6.1.*|rocm6.2|rocm6.2.*|rocm6.3|rocm6.3.*|rocm6.4|rocm6.4.*|rocm7.0|rocm7.0.*|rocm7.1|rocm7.1.*)
|
||||
echo "$_base/$_rocm_tag" ;;
|
||||
rocm6.*)
|
||||
# ROCm 6.5+ (no published PyTorch wheels): clip down
|
||||
# to the last supported 6.x wheel set.
|
||||
echo "$_base/rocm6.4" ;;
|
||||
*)
|
||||
# ROCm 7.2+ (including future 10.x+): cap to rocm7.1
|
||||
echo "$_base/rocm7.1" ;;
|
||||
|
|
|
|||
|
|
@ -172,12 +172,16 @@ def _extract_gpu_metrics(gpu_data: dict) -> dict[str, Any]:
|
|||
)
|
||||
vram_util = (
|
||||
round((vram_used_mb / vram_total_mb) * 100, 1)
|
||||
if vram_used_mb is not None and vram_total_mb and vram_total_mb > 0
|
||||
if vram_used_mb is not None
|
||||
and vram_total_mb is not None
|
||||
and vram_total_mb > 0
|
||||
else None
|
||||
)
|
||||
power_util = (
|
||||
round((power_draw / power_limit) * 100, 1)
|
||||
if power_draw is not None and power_limit and power_limit > 0
|
||||
if power_draw is not None
|
||||
and power_limit is not None
|
||||
and power_limit > 0
|
||||
else None
|
||||
)
|
||||
|
||||
|
|
@ -212,7 +216,11 @@ def get_physical_gpu_count() -> Optional[int]:
|
|||
return None
|
||||
if isinstance(data, list):
|
||||
return len(data)
|
||||
# Some versions return a dict with a "gpu" key
|
||||
# Some versions return a dict with a "gpu" / "gpus" key. Guard the
|
||||
# .get() access with an isinstance check so a malformed scalar /
|
||||
# string response from amd-smi cannot raise AttributeError.
|
||||
if not isinstance(data, dict):
|
||||
return None
|
||||
gpus = data.get("gpu", data.get("gpus", []))
|
||||
if isinstance(gpus, list):
|
||||
return len(gpus)
|
||||
|
|
@ -301,25 +309,35 @@ def get_visible_gpu_utilization(
|
|||
"index_kind": "physical",
|
||||
}
|
||||
|
||||
gpu_list = (
|
||||
data if isinstance(data, list) else data.get("gpus", data.get("gpu", [data]))
|
||||
)
|
||||
# Extract a device list from amd-smi's envelope. Newer versions return
|
||||
# a JSON array directly, older versions return a dict with a "gpus" /
|
||||
# "gpu" key wrapping the list. Guard non-dict / non-list envelopes
|
||||
# (scalar / string fallbacks from malformed output) so the .get()
|
||||
# access cannot raise AttributeError on an unexpected shape.
|
||||
if isinstance(data, list):
|
||||
gpu_list = data
|
||||
elif isinstance(data, dict):
|
||||
gpu_list = data.get("gpus", data.get("gpu", [data]))
|
||||
else:
|
||||
gpu_list = [data]
|
||||
visible_set = set(parent_visible_ids)
|
||||
ordinal_map = {gpu_id: ordinal for ordinal, gpu_id in enumerate(parent_visible_ids)}
|
||||
|
||||
devices = []
|
||||
for fallback_idx, gpu_data in enumerate(gpu_list):
|
||||
# Skip non-dict entries defensively: if amd-smi ever ships a
|
||||
# scalar inside its "gpus" array (observed on some malformed
|
||||
# output), _extract_gpu_metrics would raise AttributeError on
|
||||
# the first .get() call.
|
||||
if not isinstance(gpu_data, dict):
|
||||
continue
|
||||
# Use AMD-reported GPU ID when available, fall back to enumeration
|
||||
# index. Newer amd-smi versions wrap scalars as ``{"value": 0,
|
||||
# "unit": "none"}``, so route raw_id through ``_parse_numeric``
|
||||
# which already handles bare ints, floats, strings, and that
|
||||
# dict shape uniformly.
|
||||
raw_id = (
|
||||
gpu_data.get(
|
||||
"gpu", gpu_data.get("gpu_id", gpu_data.get("id", fallback_idx))
|
||||
)
|
||||
if isinstance(gpu_data, dict)
|
||||
else fallback_idx
|
||||
raw_id = gpu_data.get(
|
||||
"gpu", gpu_data.get("gpu_id", gpu_data.get("id", fallback_idx))
|
||||
)
|
||||
parsed_id = _parse_numeric(raw_id)
|
||||
if parsed_id is None:
|
||||
|
|
|
|||
Loading…
Reference in a new issue