mirror of
https://github.com/unslothai/unsloth
synced 2026-04-21 13:37:39 +00:00
Auto-configure AMDGPU_ASIC_ID_TABLE_PATH on ROCm startup (#4060)
* Auto-configure AMDGPU_ASIC_ID_TABLE_PATH on ROCm startup * Remove ROCm fd2 amdgpu.ids noise filter wrappers * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Use PyPI bitsandbytes for amd extra to avoid malformed wheel URL * Add amd-preview extra for bitsandbytes continuous wheel channel * Keep amd extra on bitsandbytes>=0.49.1 and remove amd-preview --------- Co-authored-by: Daniel Hanchen <danielhanchen@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
This commit is contained in:
parent
842099f2b0
commit
defcbf8bea
3 changed files with 84 additions and 85 deletions
|
|
@ -994,9 +994,8 @@ intel = [
|
|||
]
|
||||
amd = [
|
||||
"unsloth[huggingfacenotorch]",
|
||||
"bitsandbytes @ https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_x86_64.whl ; ('linux' in sys_platform) and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
|
||||
"bitsandbytes @ https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-win_amd64.whl ; (sys_platform == 'win32') and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
|
||||
"bitsandbytes @ https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_aarch64.whl ; ('linux' in sys_platform) and (platform_machine == 'aarch64')",
|
||||
"bitsandbytes>=0.49.1 ; ('linux' in sys_platform) and (platform_machine == 'AMD64' or platform_machine == 'x86_64' or platform_machine == 'aarch64')",
|
||||
"bitsandbytes>=0.49.1 ; (sys_platform == 'win32') and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
|
|
|
|||
|
|
@ -29,18 +29,21 @@ from .import_fixes import (
|
|||
fix_message_factory_issue,
|
||||
check_fbgemm_gpu_version,
|
||||
disable_broken_causal_conv1d,
|
||||
_filter_rocm_amdgpu_ids_fd2_noise,
|
||||
configure_amdgpu_asic_id_table_path,
|
||||
torchvision_compatibility_check,
|
||||
fix_diffusers_warnings,
|
||||
fix_huggingface_hub,
|
||||
)
|
||||
|
||||
# Configure libdrm ids table path early so ROCm can resolve AMD GPU names.
|
||||
configure_amdgpu_asic_id_table_path()
|
||||
disable_broken_causal_conv1d()
|
||||
fix_message_factory_issue()
|
||||
check_fbgemm_gpu_version()
|
||||
torchvision_compatibility_check()
|
||||
fix_diffusers_warnings()
|
||||
fix_huggingface_hub()
|
||||
del configure_amdgpu_asic_id_table_path
|
||||
del disable_broken_causal_conv1d
|
||||
del fix_message_factory_issue
|
||||
del check_fbgemm_gpu_version
|
||||
|
|
@ -96,9 +99,7 @@ try:
|
|||
# os.system("pip install --upgrade --no-cache-dir --no-deps --user unsloth_zoo")
|
||||
# except:
|
||||
# raise ImportError("Unsloth: Please update unsloth_zoo via `pip install --upgrade --no-cache-dir --no-deps unsloth_zoo`")
|
||||
# Filter native fd=2 amdgpu.ids noise during early unsloth_zoo import.
|
||||
with _filter_rocm_amdgpu_ids_fd2_noise():
|
||||
import unsloth_zoo
|
||||
import unsloth_zoo
|
||||
except PackageNotFoundError:
|
||||
raise ImportError(
|
||||
f"Unsloth: Please install unsloth_zoo via `pip install unsloth_zoo` then retry!"
|
||||
|
|
@ -109,9 +110,7 @@ del PackageNotFoundError, importlib_version
|
|||
|
||||
# Try importing PyTorch and check version
|
||||
try:
|
||||
# Filter native fd=2 amdgpu.ids noise during torch import on ROCm.
|
||||
with _filter_rocm_amdgpu_ids_fd2_noise():
|
||||
import torch
|
||||
import torch
|
||||
except ModuleNotFoundError:
|
||||
raise ImportError(
|
||||
"Unsloth: Pytorch is not installed. Go to https://pytorch.org/.\n"
|
||||
|
|
@ -120,16 +119,14 @@ except ModuleNotFoundError:
|
|||
except:
|
||||
raise
|
||||
|
||||
# Filter native fd=2 amdgpu.ids noise during early device detection import.
|
||||
with _filter_rocm_amdgpu_ids_fd2_noise():
|
||||
from unsloth_zoo.device_type import (
|
||||
is_hip,
|
||||
get_device_type,
|
||||
DEVICE_TYPE,
|
||||
DEVICE_TYPE_TORCH,
|
||||
DEVICE_COUNT,
|
||||
ALLOW_PREQUANTIZED_MODELS,
|
||||
)
|
||||
from unsloth_zoo.device_type import (
|
||||
is_hip,
|
||||
get_device_type,
|
||||
DEVICE_TYPE,
|
||||
DEVICE_TYPE_TORCH,
|
||||
DEVICE_COUNT,
|
||||
ALLOW_PREQUANTIZED_MODELS,
|
||||
)
|
||||
|
||||
# Fix other issues
|
||||
from .import_fixes import (
|
||||
|
|
@ -305,10 +302,8 @@ elif DEVICE_TYPE == "xpu":
|
|||
# TODO: check triton for intel installed properly.
|
||||
pass
|
||||
|
||||
# Filter native fd=2 amdgpu.ids noise during model import startup.
|
||||
with _filter_rocm_amdgpu_ids_fd2_noise():
|
||||
from .models import *
|
||||
from .models import __version__
|
||||
from .models import *
|
||||
from .models import __version__
|
||||
from .save import *
|
||||
from .chat_templates import *
|
||||
from .tokenizer_utils import *
|
||||
|
|
|
|||
|
|
@ -16,7 +16,6 @@ import os
|
|||
import importlib.abc
|
||||
import importlib.machinery
|
||||
import importlib.util
|
||||
import contextlib
|
||||
from pathlib import Path
|
||||
from importlib.metadata import version as importlib_version
|
||||
from packaging.version import Version as TrueVersion
|
||||
|
|
@ -26,7 +25,6 @@ import textwrap
|
|||
import warnings
|
||||
import sys
|
||||
import functools
|
||||
import tempfile
|
||||
|
||||
# We cannot do from unsloth_zoo.log import logger since FBGEMM might cause seg faults.
|
||||
UNSLOTH_ENABLE_LOGGING = os.environ.get("UNSLOTH_ENABLE_LOGGING", "0") in (
|
||||
|
|
@ -1191,6 +1189,13 @@ _ROCM_PATH_HINTS = (
|
|||
Path("/dev/kfd"),
|
||||
Path("/sys/module/amdgpu"),
|
||||
)
|
||||
_AMDGPU_ASIC_ID_TABLE_PATH_ENV = "AMDGPU_ASIC_ID_TABLE_PATH"
|
||||
_AMDGPU_ASIC_ID_CANDIDATE_PATHS = (
|
||||
Path("/usr/share/libdrm/amdgpu.ids"),
|
||||
Path("/usr/local/share/libdrm/amdgpu.ids"),
|
||||
Path("/opt/rocm/share/libdrm/amdgpu.ids"),
|
||||
Path("/opt/amdgpu/share/libdrm/amdgpu.ids"),
|
||||
)
|
||||
|
||||
|
||||
def _log_rocm_detection(message):
|
||||
|
|
@ -1236,68 +1241,70 @@ def _is_rocm_torch_build() -> bool:
|
|||
return False
|
||||
|
||||
|
||||
@contextlib.contextmanager
|
||||
def _filter_stderr_fd(
|
||||
suppressed_substrings = (_AMDGPU_IDS_MISSING_TEXT,),
|
||||
):
|
||||
"""
|
||||
Capture low-level fd=2 writes, drop only known noisy substrings, and replay
|
||||
everything else after the protected block.
|
||||
"""
|
||||
saved_stderr_fd = None
|
||||
temp_file = None
|
||||
redirected = False
|
||||
def _iter_amdgpu_asic_id_table_candidates():
|
||||
# Try torch-adjacent ids table paths first without importing torch.
|
||||
try:
|
||||
saved_stderr_fd = os.dup(2)
|
||||
temp_file = tempfile.TemporaryFile(mode = "w+b")
|
||||
os.dup2(temp_file.fileno(), 2)
|
||||
redirected = True
|
||||
torch_spec = importlib.util.find_spec("torch")
|
||||
except Exception:
|
||||
redirected = False
|
||||
torch_spec = None
|
||||
|
||||
try:
|
||||
yield
|
||||
finally:
|
||||
captured = b""
|
||||
if redirected and temp_file is not None:
|
||||
try:
|
||||
temp_file.flush()
|
||||
temp_file.seek(0)
|
||||
captured = temp_file.read()
|
||||
except Exception:
|
||||
captured = b""
|
||||
if redirected and saved_stderr_fd is not None:
|
||||
try:
|
||||
os.dup2(saved_stderr_fd, 2)
|
||||
except Exception:
|
||||
pass
|
||||
if captured and saved_stderr_fd is not None:
|
||||
try:
|
||||
for raw_line in captured.splitlines(keepends = True):
|
||||
line = raw_line.decode("utf-8", errors = "ignore")
|
||||
if any(s in line for s in suppressed_substrings):
|
||||
continue
|
||||
os.write(saved_stderr_fd, raw_line)
|
||||
except Exception:
|
||||
pass
|
||||
if temp_file is not None:
|
||||
try:
|
||||
temp_file.close()
|
||||
except Exception:
|
||||
pass
|
||||
if saved_stderr_fd is not None:
|
||||
try:
|
||||
os.close(saved_stderr_fd)
|
||||
except Exception:
|
||||
pass
|
||||
roots = []
|
||||
if torch_spec is not None:
|
||||
if torch_spec.origin:
|
||||
roots.append(Path(torch_spec.origin).resolve().parent)
|
||||
if torch_spec.submodule_search_locations:
|
||||
for location in torch_spec.submodule_search_locations:
|
||||
roots.append(Path(location).resolve())
|
||||
|
||||
seen = set()
|
||||
for root in roots:
|
||||
for candidate in (
|
||||
root / "share" / "libdrm" / "amdgpu.ids",
|
||||
root.parent / "share" / "libdrm" / "amdgpu.ids",
|
||||
root.parent.parent / "share" / "libdrm" / "amdgpu.ids",
|
||||
):
|
||||
candidate_str = str(candidate)
|
||||
if candidate_str in seen:
|
||||
continue
|
||||
seen.add(candidate_str)
|
||||
yield candidate
|
||||
|
||||
for candidate in _AMDGPU_ASIC_ID_CANDIDATE_PATHS:
|
||||
candidate_str = str(candidate)
|
||||
if candidate_str in seen:
|
||||
continue
|
||||
seen.add(candidate_str)
|
||||
yield candidate
|
||||
|
||||
|
||||
def _filter_rocm_amdgpu_ids_fd2_noise():
|
||||
# ROCm/libdrm can emit amdgpu.ids missing errors via low-level fd=2 writes.
|
||||
# Python-level stderr filters cannot intercept those writes.
|
||||
def configure_amdgpu_asic_id_table_path():
|
||||
# Honor an existing valid user-provided path.
|
||||
configured = os.environ.get(_AMDGPU_ASIC_ID_TABLE_PATH_ENV, "").strip()
|
||||
if configured:
|
||||
configured_path = Path(configured)
|
||||
try:
|
||||
if configured_path.is_file():
|
||||
return str(configured_path)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Only attempt this on ROCm-like environments.
|
||||
if not _is_rocm_torch_build():
|
||||
return contextlib.nullcontext()
|
||||
return _filter_stderr_fd()
|
||||
return None
|
||||
|
||||
for candidate in _iter_amdgpu_asic_id_table_candidates():
|
||||
try:
|
||||
if candidate.is_file():
|
||||
os.environ[_AMDGPU_ASIC_ID_TABLE_PATH_ENV] = str(candidate)
|
||||
if UNSLOTH_ENABLE_LOGGING:
|
||||
logger.info(
|
||||
f"Unsloth: Set {_AMDGPU_ASIC_ID_TABLE_PATH_ENV}={candidate}"
|
||||
)
|
||||
return str(candidate)
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def _is_causal_conv1d_name(module_name: str) -> bool:
|
||||
|
|
@ -1432,9 +1439,7 @@ def disable_broken_causal_conv1d():
|
|||
return
|
||||
|
||||
try:
|
||||
# Suppress only native fd=2 amdgpu.ids noise during causal_conv1d probe.
|
||||
with _filter_rocm_amdgpu_ids_fd2_noise():
|
||||
import causal_conv1d # noqa: F401
|
||||
import causal_conv1d # noqa: F401
|
||||
|
||||
return
|
||||
except Exception as error:
|
||||
|
|
|
|||
Loading…
Reference in a new issue