From f26948b4930c8c58d23c13025f6d5ea1132f32e7 Mon Sep 17 00:00:00 2001
From: danielhanchen <danielhanchen@users.noreply.github.com>
Date: Thu, 8 Jan 2026 04:14:53 +0000
Subject: [PATCH 1/2] Fix FBGEMM/CUTLASS errors on SM100 (Blackwell) GPUs

This PR fixes the "Arch conditional MMA instruction used without targeting
appropriate compute capability. Aborting." errors that occur when using
FBGEMM on Blackwell GPUs (B200/B100, SM100).

Changes:
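- Add stderr filters in import_fixes.py for CUTLASS/FBGEMM MMA and TMA
  errors, and for torchao's "Skipping import of cpp extensions" message
- Add warning filters for torch (PYTORCH_CUDA_ALLOC_CONF, TF32) and
  torchao (`int4_weight_only`, `int8_weight_only`) deprecation messages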
- Update check_fbgemm_gpu_version() to disable FBGEMM (by setting
  UNSLOTH_HAS_FBGEMM=0) instead of raising ImportError when an
  fbgemm_gpu version older than 1.4.0 is detected
- Update test_has_fbgemm() in fp8.py to catch broader CUTLASS/CUDA errors
  and gracefully fall back to Triton kernels
- Update loader_utils.py to disable FBGEMM instead of raising ValueError
  when fbgemm_gpu is older than 1.4.1

The key behavior change is that FBGEMM errors no longer crash the script.
Instead, FBGEMM is disabled and Triton kernels are used automatically.
This allows Unsloth to work on SM100 GPUs where CUTLASS SM90 kernels fail,
and also gracefully handles old FBGEMM versions.
---
 unsloth/import_fixes.py        | 34 +++++++++++++++++++++++++++++++---
 unsloth/kernels/fp8.py         | 22 +++++++++++++++++++---
 unsloth/models/loader_utils.py | 10 +++++++---
 3 files changed, 57 insertions(+), 9 deletions(-)

diff --git a/unsloth/import_fixes.py b/unsloth/import_fixes.py
index 1e05e462e..5ad341e2a 100644
--- a/unsloth/import_fixes.py
+++ b/unsloth/import_fixes.py
@@ -94,16 +94,40 @@ class HidePrintMessage:
 if os.environ.get("UNSLOTH_ENABLE_LOGGING", "0") != "1":
     import sys
 
-    # Apply to stderr for FBGEMM
+    # Apply to stderr for FBGEMM and CUTLASS errors
     sys.stderr = HidePrintMessage(sys.stderr)
     # https://github.com/pytorch/FBGEMM/blob/d99cd96490ec4aabac2ee95b1e76ea4dcfcfa628/fbgemm_gpu/experimental/gemm/triton_gemm/utils.py#L43-L52
     sys.stderr.add_filter("TMA benchmarks will be running")
+    # CUTLASS/FBGEMM MMA instruction error on SM90 vs SM100 (Blackwell) GPUs
+    # https://github.com/NVIDIA/cutlass/blob/main/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp
+    sys.stderr.add_filter("Arch conditional MMA instruction used without targeting")
+    # CUTLASS arch conditional errors for various architectures
+    sys.stderr.add_filter("CUTE_INVALID_CONTROL_PATH")
+    # CUTLASS TMA-related errors when not targeting correct architecture
+    sys.stderr.add_filter("Trying to use tma without CUTE_ARCH_TMA")
     # Skipping import of cpp extensions due to incompatible torch version 2.9.0+cu128 for torchao version 0.15.0
     logging.getLogger("torchao").setLevel(logging.ERROR)
+    # Also filter torchao print to stderr about cpp extensions
+    sys.stderr.add_filter("Skipping import of cpp extensions")
     # SyntaxWarning: invalid escape sequence '\.'
     warnings.filterwarnings(
         "ignore", message = "invalid escape sequence", category = SyntaxWarning
     )
+    # PYTORCH_CUDA_ALLOC_CONF is deprecated warning from torch
+    warnings.filterwarnings(
+        "ignore", message = "PYTORCH_CUDA_ALLOC_CONF is deprecated"
+    )
+    # TF32 precision deprecation warning from torch
+    warnings.filterwarnings(
+        "ignore", message = "Please use the new API settings to control TF32"
+    )
+    # Deprecation warnings from torchao
+    warnings.filterwarnings(
+        "ignore", message = "`int4_weight_only` is deprecated"
+    )
+    warnings.filterwarnings(
+        "ignore", message = "`int8_weight_only` is deprecated"
+    )
 
 
 # Fix up AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'
@@ -323,10 +347,14 @@ def check_fbgemm_gpu_version():
     except:
         return
     # We noticed some SegFault or bad alloc errors on lower versions of fbgemm_gpu.
+    # Instead of raising an error, disable FBGEMM and fall back to Triton kernels.
     if Version(fbgemm_gpu_version) < Version("1.4.0"):
-        raise ImportError(
-            f"Unsloth: fbgemm_gpu_genai=={fbgemm_gpu_version} detected. It might cause unexpected issues like segmentation faults. Please uninstall the current one by doing `pip uninstall fbgemm-gpu` && `pip install fbgemm-gpu` to install fbgemm-gpu 1.4.0 or newer!"
+        os.environ["UNSLOTH_HAS_FBGEMM"] = "0"
+        logger.info(
+            f"Unsloth: fbgemm_gpu_genai=={fbgemm_gpu_version} is old and may cause issues. "
+            f"Disabling FBGEMM - using Triton kernels instead."
         )
+        return
 
     logger.info(f"Unsloth: fbgemm_gpu_genai=={fbgemm_gpu_version} detected.")
 
diff --git a/unsloth/kernels/fp8.py b/unsloth/kernels/fp8.py
index 3093bf61b..e9f916170 100644
--- a/unsloth/kernels/fp8.py
+++ b/unsloth/kernels/fp8.py
@@ -523,6 +523,7 @@ def fp8_fbgemm_block_linear(X, weight, weight_scale, bias = None):
 def test_has_fbgemm():
     # We must manually check if the faster FBGEMM works on the specific GPU
     # For example RTX 5090 and RTX 4090 does not work
+    # Also SM100 (Blackwell B200/B100) GPUs fail with CUTLASS SM90 kernels
     # [TODO] Investigate with TorchAO why FBGEMM fails on consumer GPUs
     M, N, K = 128, 128, 128
     xq = torch.ones(M, K, dtype = torch.float8_e4m3fn, device = "cuda")
@@ -537,10 +538,25 @@ def test_has_fbgemm():
         has_fbgemm = True
         del out
     except Exception as e:
-        e = str(e)
-        if "cutlass cannot initialize" in e.lower():
+        error_str = str(e).lower()
+        # Catch any CUTLASS/CUDA errors and disable FBGEMM
+        # This includes MMA instruction errors, architecture mismatches, kernel launch failures, etc.
+        cutlass_cuda_errors = (
+            "cutlass",
+            "cuda error",
+            "cuda runtime error",
+            "no kernel image",
+            "arch conditional",
+            "mma instruction",
+            "compute capability",
+            "cute_invalid_control_path",
+            "tma",
+        )
+        is_cutlass_cuda_error = any(err in error_str for err in cutlass_cuda_errors)
+
+        if is_cutlass_cuda_error:
             print(
-                f"Unsloth: FBGEMM on the current GPU cannot load - will switch to Triton kernels"
+                "Unsloth: FBGEMM on the current GPU cannot load - will switch to Triton kernels"
             )
         else:
             print(
diff --git a/unsloth/models/loader_utils.py b/unsloth/models/loader_utils.py
index fe2a89d89..9656cc9d2 100644
--- a/unsloth/models/loader_utils.py
+++ b/unsloth/models/loader_utils.py
@@ -408,7 +408,7 @@ def _get_fp8_mode_and_check_settings(
     if Version(torchao.__version__) < Version("0.15.0"):
         raise ValueError(error_message)
 
-    # If fbgemm_gpu_genai is installed, check if it's >= 1.4.1
+    # If fbgemm_gpu_genai is installed and old, disable FBGEMM and use Triton instead
     if (
         importlib.util.find_spec("fbgemm_gpu") is not None
         and importlib.util.find_spec("fbgemm_gpu.experimental") is not None
@@ -416,7 +416,11 @@ def _get_fp8_mode_and_check_settings(
         import fbgemm_gpu.experimental.gen_ai
 
         if Version(fbgemm_gpu.__version__) < Version("1.4.1"):
-            raise ValueError(
-                "Unsloth: On the fly `load_in_fp8` is only compatible with fbgemm_gpu_genai 1.4.1+. Try `unsloth/Qwen3-8B` instead."
+            # Old FBGEMM version - disable and use Triton kernels instead
+            os.environ["UNSLOTH_HAS_FBGEMM"] = "0"
+            from unsloth_zoo.log import logger
+            logger.info(
+                f"Unsloth: fbgemm_gpu_genai=={fbgemm_gpu.__version__} is old for FP8 loading. "
+                f"Using Triton kernels instead."
             )
     return fp8_mode

From 2ee55010d3add51428ef8025ca99aa108565ac3c Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 8 Jan 2026 04:15:17 +0000
Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 unsloth/import_fixes.py        | 12 +++---------
 unsloth/models/loader_utils.py |  1 +
 2 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/unsloth/import_fixes.py b/unsloth/import_fixes.py
index 5ad341e2a..27e5342e2 100644
--- a/unsloth/import_fixes.py
+++ b/unsloth/import_fixes.py
@@ -114,20 +114,14 @@ if os.environ.get("UNSLOTH_ENABLE_LOGGING", "0") != "1":
         "ignore", message = "invalid escape sequence", category = SyntaxWarning
     )
     # PYTORCH_CUDA_ALLOC_CONF is deprecated warning from torch
-    warnings.filterwarnings(
-        "ignore", message = "PYTORCH_CUDA_ALLOC_CONF is deprecated"
-    )
+    warnings.filterwarnings("ignore", message = "PYTORCH_CUDA_ALLOC_CONF is deprecated")
     # TF32 precision deprecation warning from torch
     warnings.filterwarnings(
         "ignore", message = "Please use the new API settings to control TF32"
     )
     # Deprecation warnings from torchao
-    warnings.filterwarnings(
-        "ignore", message = "`int4_weight_only` is deprecated"
-    )
-    warnings.filterwarnings(
-        "ignore", message = "`int8_weight_only` is deprecated"
-    )
+    warnings.filterwarnings("ignore", message = "`int4_weight_only` is deprecated")
+    warnings.filterwarnings("ignore", message = "`int8_weight_only` is deprecated")
 
 
 # Fix up AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'
diff --git a/unsloth/models/loader_utils.py b/unsloth/models/loader_utils.py
index 9656cc9d2..1e5533c25 100644
--- a/unsloth/models/loader_utils.py
+++ b/unsloth/models/loader_utils.py
@@ -419,6 +419,7 @@ def _get_fp8_mode_and_check_settings(
             # Old FBGEMM version - disable and use Triton kernels instead
             os.environ["UNSLOTH_HAS_FBGEMM"] = "0"
             from unsloth_zoo.log import logger
+
             logger.info(
                 f"Unsloth: fbgemm_gpu_genai=={fbgemm_gpu.__version__} is old for FP8 loading. "
                 f"Using Triton kernels instead."