Many bug fixes (#754)

* Update gemma2.py

* Update llama.py

* Update llama.py

* Update gemma2.py

* init

* Update gemma2.py

* Update gemma2.py

* Update _utils.py

* Update _utils.py

* Update _utils.py

* Update _utils.py

* Update _utils.py

* Update gemma2.py

* Update gemma2.py

* Update gemma2.py

* All RoPE Scaling support

* cleanup

* Update llama.py

* Update llama.py

* Update _utils.py

* Update _utils.py

* exec

* exec

* Attention_Module

* attention_module

* imports

* exec

* Update llama.py

* Update llama.py

* boolean mask

* revert masking

* Update llama.py

* Update save.py

* Update llama.py

* Update gemma2.py

* Update gemma2.py

* Update gemma2.py

* Update utils.py

* retry

* Update gemma2.py

* Update gemma2.py

* Update gemma2.py

* Update _utils.py

* Update _utils.py

* Update gemma2.py

* Update chat_templates.py

* Gemma 2 Ollama support

* Update llama.py

* Update llama.py

* error handling

* Update _utils.py

* Update _utils.py

* Stats for debugging

* Update _utils.py

* Update _utils.py

* Debugging

* Update tokenizer_utils.py

* Update _utils.py

* Update cross_entropy_loss.py

* Update cross_entropy_loss.py

* Update cross_entropy_loss.py

* Update rms_layernorm.py

* Update rms_layernorm.py

* Update rms_layernorm.py

* Update rms_layernorm.py

* Update rms_layernorm.py

* Update rms_layernorm.py

* Update rms_layernorm.py

* Check exec, eval

* Update _utils.py

* Update _utils.py

* Images

* Bug fixes

* Update pyproject.toml

* Bug fixes

* Update _utils.py

* Update _utils.py
This commit is contained in:
Daniel Han 2024-07-10 01:59:06 -07:00 committed by GitHub
parent 316aaefdf2
commit f176cbd36a
10 changed files with 110 additions and 44 deletions

BIN
images/Assistant.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 81 KiB

BIN
images/Terminal_Type.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 68 KiB

BIN
images/Where_Terminal.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 175 KiB

View file

@ -33,6 +33,7 @@ exclude = ["images*"]
[project.optional-dependencies]
huggingface = [
"packaging",
"tyro",
"transformers>=4.42.3",
"datasets>=2.16.0",
@ -184,6 +185,7 @@ colab-ampere-torch220 = [
"flash-attn",
]
colab-new = [
"packaging",
"tyro",
"transformers>=4.42.3",
"datasets>=2.16.0",
@ -198,7 +200,7 @@ colab-no-deps = [
"accelerate>=0.26.1",
"trl>=0.7.9",
"peft>=0.7.1",
"xformers",
"xformers<0.0.27",
"bitsandbytes",
"protobuf<4.0.0",
]

View file

@ -43,6 +43,7 @@ from platform import system as platform_system
platform_system = platform_system()
import numpy as np
import warnings, subprocess, re, inspect, psutil, os, math
from packaging.version import Version
# =============================================
# Disable some warnings which can get annoying
@ -126,6 +127,23 @@ pass
import xformers.ops.fmha as xformers
xformers_attention = xformers.memory_efficient_attention
from xformers import __version__ as xformers_version
# Temporarily disable 0.0.27 and higher - inference issues
if Version(xformers_version) >= Version("0.0.27"):
raise ImportError(
f"Unsloth: Your xformers version of {xformers_version} is too new.\n"\
'Please downgrade xformers via `pip install --force-reinstall "xformers<0.0.27"`'
)
pass
# Check TRL version
from trl import __version__ as trl_version
if Version(trl_version) >= Version("0.9.0"):
raise ImportError(
f"Unsloth: Your TRL version of {trl_version} is too new.\n"\
'Please downgrade TRL via `pip install --force-reinstall "trl<0.9.0"`'
)
pass
# =============================================
# =============================================
@ -696,12 +714,14 @@ pass
def check_nvidia():
# Unsloth doesn't work yet on AMD devices - we're working on it!
output = np.array([0,])
try:
output = subprocess.check_output("nvidia-smi --query-gpu=memory.used --format=csv", shell = True)
output = re.findall(rb'([\d]{1,})[\s]{1,}M', output)
output = np.array([int(x.decode('utf-8'))/1024 for x in output])
except:
raise RuntimeError("Unsloth: We do not support AMD / Intel machines yet - it is a work in progress!")
output = re.findall(rb'([\d]{1,})[\s]{1,}M', output)
output = np.array([int(x.decode('utf-8'))/1024 for x in output])
if not torch.cuda.is_available():
raise RuntimeError("Unsloth: We do not support AMD / Intel machines yet - it is a work in progress!")
return output
pass
PRE_CHECK = check_nvidia()

View file

@ -15,15 +15,29 @@
from .llama import *
from ._utils import __version__
from transformers.models.gemma.modeling_gemma import (
GemmaAttention,
GemmaDecoderLayer,
GemmaModel,
GemmaForCausalLM,
GemmaRotaryEmbedding,
apply_rotary_pos_emb,
repeat_kv,
)
try:
from transformers.models.gemma.modeling_gemma import (
GemmaAttention,
GemmaDecoderLayer,
GemmaModel,
GemmaForCausalLM,
GemmaRotaryEmbedding,
apply_rotary_pos_emb,
repeat_kv,
)
except:
from packaging.version import Version
transformers_version = Version(transformers_version)
if not transformers_version >= Version("4.38"):
raise ImportError(
f"Unsloth: Your transformers version of {transformers_version} does not support Gemma.\n"\
f"The minimum required version is 4.38.\n"\
f'Try `pip install --upgrade "transformers>=4.38"`\n'\
f"to obtain the latest transformers build, then restart this session."\
)
pass
pass
from transformers.modeling_attn_mask_utils import (
_prepare_4d_causal_attention_mask_for_sdpa,
)

View file

@ -19,15 +19,29 @@ from .gemma import (
GemmaFixedLinearScalingRotaryEmbedding,
fast_geglu_inference,
)
from transformers.models.gemma2.modeling_gemma2 import (
Gemma2Attention,
Gemma2DecoderLayer,
Gemma2Model,
Gemma2ForCausalLM,
Gemma2RotaryEmbedding,
apply_rotary_pos_emb,
repeat_kv,
)
try:
from transformers.models.gemma2.modeling_gemma2 import (
Gemma2Attention,
Gemma2DecoderLayer,
Gemma2Model,
Gemma2ForCausalLM,
Gemma2RotaryEmbedding,
apply_rotary_pos_emb,
repeat_kv,
)
except:
from packaging.version import Version
transformers_version = Version(transformers_version)
if not transformers_version >= Version("4.42"):
raise ImportError(
f"Unsloth: Your transformers version of {transformers_version} does not support Gemma2.\n"\
f"The minimum required version is 4.42.3.\n"\
f'Try `pip install --upgrade "transformers>=4.42.3"`\n'\
f"to obtain the latest transformers build, then restart this session."\
)
pass
pass
from transformers.modeling_attn_mask_utils import (
_prepare_4d_causal_attention_mask_for_sdpa,
)
@ -46,7 +60,7 @@ pass
# [TODO] We must randomly use torch.compile?
# I checked the gradients and formulas and I'm sure it's correct.
# I'm stumped :(
@torch.compile(fullgraph = True, dynamic = True)#, options = torch_compile_options)
@torch.compile(fullgraph = True, dynamic = True, options = torch_compile_options)
def fast_rms_layernorm_gemma2_compiled(layernorm, X, gemma = True):
old_dtype = X.dtype
X = X.float()
@ -70,7 +84,11 @@ def gemma2_attention(Q, K, V, causal_mask, self, bsz, q_len):
K = K.reshape(bsz, n_heads, q_len, head_dim)
V = V.reshape(bsz, n_heads, q_len, head_dim)
s = self.config.hidden_size // self.config.num_attention_heads
# See https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
# Gemma 9b should use 256 and not 224 (hs / nah). 27b uses the below
# We default to using the config file itself
# s = self.config.hidden_size // self.config.num_attention_heads
s = self.config.query_pre_attn_scalar
t = self.config.attn_logit_softcapping
Q = Q * torch.tensor(s**-0.5, dtype = Q.dtype) # Follow Keras exactly
@ -260,7 +278,13 @@ def Gemma2Attention_fast_forward_inference(
# Only for Gemma2
self.temp_O = torch.empty((1, bsz, self.hidden_size), dtype = dtype, device = "cuda:0")
self.attention = torch.empty((bsz, n_heads, 1, KV_CACHE_INCREMENT+seq_len), dtype = dtype, device = "cuda:0")
self.scalar = 1.0 / math_sqrt(self.config.hidden_size // self.config.num_attention_heads)
# See https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
# Gemma 9b should use 256 and not 224 (hs / nah). 27b uses the below
# We default to using the config file itself
# s = self.config.hidden_size // self.config.num_attention_heads
self.scalar = 1.0 / math_sqrt(self.config.query_pre_attn_scalar)
# self.scalar = 1.0 / math_sqrt(self.config.hidden_size // self.config.num_attention_heads)
self.half_head_dim = head_dim // 2
self. t = self.config.attn_logit_softcapping
self.reciprocal_t = 1.0 / self.config.attn_logit_softcapping

View file

@ -1276,12 +1276,14 @@ class FastLlamaModel:
f' "-____-" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}'
logger.warning(debug_info)
import subprocess, re, gc, numpy as np
a = np.array([0,])
try:
a = subprocess.check_output('nvidia-smi --query-gpu=memory.used --format=csv', shell = True)
a = re.findall(rb'([\\d]{1,})[\\s]{1,}M', a)
a = np.array([int(x.decode('utf-8'))/1024 for x in a])
except:
raise RuntimeError('Unsloth: We do not support AMD / Intel machines yet - it is a work in progress!')
a = re.findall(rb'([\\d]{1,})[\\s]{1,}M', a)
a = np.array([int(x.decode('utf-8'))/1024 for x in a])
if not torch.cuda.is_available():
raise RuntimeError('Unsloth: We do not support AMD / Intel machines yet - it is a work in progress!')
if ((a - PRE_CHECK) >= 1).sum() > 1:
raise RuntimeError('Unsloth currently does not support multi GPU setups - but we are working on it!')
for _ in range(3):

View file

@ -22,16 +22,16 @@ from .mapper import INT_TO_FLOAT_MAPPER, FLOAT_TO_INT_MAPPER
import os
# https://github.com/huggingface/transformers/pull/26037 allows 4 bit loading!
major, minor = transformers_version.split(".")[:2]
major, minor = int(major), int(minor)
SUPPORTS_FOURBIT = (major > 4) or (major == 4 and minor >= 37)
SUPPORTS_GEMMA = (major > 4) or (major == 4 and minor >= 38)
SUPPORTS_GEMMA2 = (major > 4) or (major == 4 and minor >= 42)
from packaging.version import Version
transformers_version = Version(transformers_version)
SUPPORTS_FOURBIT = transformers_version >= Version("4.37")
SUPPORTS_GEMMA = transformers_version >= Version("4.38")
SUPPORTS_GEMMA2 = transformers_version >= Version("4.42")
if SUPPORTS_GEMMA:
from .gemma import FastGemmaModel
if SUPPORTS_GEMMA2:
from .gemma2 import FastGemma2Model
del major, minor
pass
def _get_model_name(model_name, load_in_4bit = True):
@ -134,7 +134,7 @@ class FastLanguageModel(FastLlamaModel):
elif model_type == "mistral": dispatch_model = FastMistralModel
elif model_type == "gemma":
if not SUPPORTS_GEMMA:
raise RuntimeError(
raise ImportError(
f"Unsloth: Your transformers version of {transformers_version} does not support Gemma.\n"\
f"The minimum required version is 4.38.\n"\
f'Try `pip install --upgrade "transformers>=4.38"`\n'\
@ -143,10 +143,10 @@ class FastLanguageModel(FastLlamaModel):
dispatch_model = FastGemmaModel
elif model_type == "gemma2":
if not SUPPORTS_GEMMA2:
raise RuntimeError(
raise ImportError(
f"Unsloth: Your transformers version of {transformers_version} does not support Gemma2.\n"\
f"The minimum required version is 4.43.\n"\
f'Try `pip install --upgrade "transformers>=4.43"`\n'\
f"The minimum required version is 4.42.3.\n"\
f'Try `pip install --upgrade "transformers>=4.42.3"`\n'\
f"to obtain the latest transformers build, then restart this session."\
)
dispatch_model = FastGemma2Model

View file

@ -910,12 +910,14 @@ pass
def check_nvidia():
# Unsloth doesn't work yet on AMD devices - we're working on it!
output = np.array([0,])
try:
output = subprocess.check_output("nvidia-smi --query-gpu=memory.used --format=csv", shell = True)
output = re.findall(rb'([\d]{1,})[\s]{1,}M', output)
output = np.array([int(x.decode('utf-8'))/1024 for x in output])
except:
raise RuntimeError("Unsloth: We do not support AMD / Intel machines yet - it is a work in progress!")
output = re.findall(rb'([\d]{1,})[\s]{1,}M', output)
output = np.array([int(x.decode('utf-8'))/1024 for x in output])
if not torch.cuda.is_available():
raise RuntimeError("Unsloth: We do not support AMD / Intel machines yet - it is a work in progress!")
return output
pass
PRE_CHECK = check_nvidia()
@ -972,12 +974,14 @@ def patch_sft_trainer_tokenizer():
" )\n"\
"pass\n"\
"import subprocess, re, gc, numpy as np\n"\
"a = np.array([0,])\n"\
"try:\n"\
" a = subprocess.check_output('nvidia-smi --query-gpu=memory.used --format=csv', shell = True)\n"\
" a = re.findall(rb'([\\d]{1,})[\\s]{1,}M', a)\n"\
" a = np.array([int(x.decode('utf-8'))/1024 for x in a])\n"\
"except:\n"\
" raise RuntimeError('Unsloth: We do not support AMD / Intel machines yet - it is a work in progress!')\n"\
"a = re.findall(rb'([\\d]{1,})[\\s]{1,}M', a)\n"\
"a = np.array([int(x.decode('utf-8'))/1024 for x in a])\n"\
" if not torch.cuda.is_available():\n"\
" raise RuntimeError('Unsloth: We do not support AMD / Intel machines yet - it is a work in progress!')\n"\
"if ((a - PRE_CHECK) >= 1).sum() > 1:\n"\
" raise RuntimeError('Unsloth currently does not support multi GPU setups - but we are working on it!')\n"\
"for _ in range(3):\n"\