Many bug fixes (#754)

* Update gemma2.py

* Update llama.py

* Update llama.py

* Update gemma2.py

* init

* Update gemma2.py

* Update gemma2.py

* Update _utils.py

* Update _utils.py

* Update _utils.py

* Update _utils.py

* Update _utils.py

* Update gemma2.py

* Update gemma2.py

* Update gemma2.py

* All RoPE Scaling support

* cleanup

* Update llama.py

* Update llama.py

* Update _utils.py

* Update _utils.py

* exec

* exec

* Attention_Module

* attention_module

* imports

* exec

* Update llama.py

* Update llama.py

* boolean mask

* revert masking

* Update llama.py

* Update save.py

* Update llama.py

* Update gemma2.py

* Update gemma2.py

* Update gemma2.py

* Update utils.py

* retry

* Update gemma2.py

* Update gemma2.py

* Update gemma2.py

* Update _utils.py

* Update _utils.py

* Update gemma2.py

* Update chat_templates.py

* Gemma 2 Ollama support

* Update llama.py

* Update llama.py

* error handling

* Update _utils.py

* Update _utils.py

* Stats for debugging

* Update _utils.py

* Update _utils.py

* Debugging

* Update tokenizer_utils.py

* Update _utils.py

* Update cross_entropy_loss.py

* Update cross_entropy_loss.py

* Update cross_entropy_loss.py

* Update rms_layernorm.py

* Update rms_layernorm.py

* Update rms_layernorm.py

* Update rms_layernorm.py

* Update rms_layernorm.py

* Update rms_layernorm.py

* Update rms_layernorm.py

* Check exec, eval

* Update _utils.py

* Update _utils.py

* Images

* Bug fixes

* Update pyproject.toml

* Bug fixes

* Update _utils.py

* Update _utils.py
This commit is contained in:
Daniel Han 2024-07-10 01:59:06 -07:00 committed by GitHub
parent 316aaefdf2
commit f176cbd36a
10 changed files with 110 additions and 44 deletions

BIN
images/Assistant.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 81 KiB

BIN
images/Terminal_Type.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 68 KiB

BIN
images/Where_Terminal.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 175 KiB

View file

@ -33,6 +33,7 @@ exclude = ["images*"]
[project.optional-dependencies]
huggingface = [
"packaging",
"tyro",
"transformers>=4.42.3",
"datasets>=2.16.0",
@ -184,6 +185,7 @@ colab-ampere-torch220 = [
"flash-attn",
]
colab-new = [
"packaging",
"tyro",
"transformers>=4.42.3",
"datasets>=2.16.0",
@ -198,7 +200,7 @@ colab-no-deps = [
"accelerate>=0.26.1",
"trl>=0.7.9",
"peft>=0.7.1",
"xformers",
"xformers<0.0.27",
"bitsandbytes",
"protobuf<4.0.0",
]

View file

@ -43,6 +43,7 @@ from platform import system as platform_system
platform_system = platform_system()
import numpy as np
import warnings, subprocess, re, inspect, psutil, os, math
from packaging.version import Version
# =============================================
# Disable some warnings which can get annoying
@ -126,6 +127,23 @@ pass
import xformers.ops.fmha as xformers
xformers_attention = xformers.memory_efficient_attention
from xformers import __version__ as xformers_version
# Temporarily disable 0.0.27 and higher - inference issues
if Version(xformers_version) >= Version("0.0.27"):
raise ImportError(
f"Unsloth: Your xformers version of {xformers_version} is too new.\n"\
'Please downgrade xformers via `pip install --force-reinstall "xformers<0.0.27"`'
)
pass
# Check TRL version
from trl import __version__ as trl_version
if Version(trl_version) >= Version("0.9.0"):
raise ImportError(
f"Unsloth: Your TRL version of {trl_version} is too new.\n"\
'Please downgrade TRL via `pip install --force-reinstall "trl<0.9.0"`'
)
pass
# =============================================
# =============================================
@ -696,12 +714,14 @@ pass
def check_nvidia():
# Unsloth doesn't work yet on AMD devices - we're working on it!
output = np.array([0,])
try:
output = subprocess.check_output("nvidia-smi --query-gpu=memory.used --format=csv", shell = True)
output = re.findall(rb'([\d]{1,})[\s]{1,}M', output)
output = np.array([int(x.decode('utf-8'))/1024 for x in output])
except:
raise RuntimeError("Unsloth: We do not support AMD / Intel machines yet - it is a work in progress!")
output = re.findall(rb'([\d]{1,})[\s]{1,}M', output)
output = np.array([int(x.decode('utf-8'))/1024 for x in output])
if not torch.cuda.is_available():
raise RuntimeError("Unsloth: We do not support AMD / Intel machines yet - it is a work in progress!")
return output
pass
PRE_CHECK = check_nvidia()

View file

@ -15,15 +15,29 @@
from .llama import *
from ._utils import __version__
from transformers.models.gemma.modeling_gemma import (
GemmaAttention,
GemmaDecoderLayer,
GemmaModel,
GemmaForCausalLM,
GemmaRotaryEmbedding,
apply_rotary_pos_emb,
repeat_kv,
)
try:
from transformers.models.gemma.modeling_gemma import (
GemmaAttention,
GemmaDecoderLayer,
GemmaModel,
GemmaForCausalLM,
GemmaRotaryEmbedding,
apply_rotary_pos_emb,
repeat_kv,
)
except:
from packaging.version import Version
transformers_version = Version(transformers_version)
if not transformers_version >= Version("4.38"):
raise ImportError(
f"Unsloth: Your transformers version of {transformers_version} does not support Gemma.\n"\
f"The minimum required version is 4.38.\n"\
f'Try `pip install --upgrade "transformers>=4.38"`\n'\
f"to obtain the latest transformers build, then restart this session."\
)
pass
pass
from transformers.modeling_attn_mask_utils import (
_prepare_4d_causal_attention_mask_for_sdpa,
)

View file

@ -19,15 +19,29 @@ from .gemma import (
GemmaFixedLinearScalingRotaryEmbedding,
fast_geglu_inference,
)
from transformers.models.gemma2.modeling_gemma2 import (
Gemma2Attention,
Gemma2DecoderLayer,
Gemma2Model,
Gemma2ForCausalLM,
Gemma2RotaryEmbedding,
apply_rotary_pos_emb,
repeat_kv,
)
try:
from transformers.models.gemma2.modeling_gemma2 import (
Gemma2Attention,
Gemma2DecoderLayer,
Gemma2Model,
Gemma2ForCausalLM,
Gemma2RotaryEmbedding,
apply_rotary_pos_emb,
repeat_kv,
)
except:
from packaging.version import Version
transformers_version = Version(transformers_version)
if not transformers_version >= Version("4.42"):
raise ImportError(
f"Unsloth: Your transformers version of {transformers_version} does not support Gemma2.\n"\
f"The minimum required version is 4.42.3.\n"\
f'Try `pip install --upgrade "transformers>=4.42.3"`\n'\
f"to obtain the latest transformers build, then restart this session."\
)
pass
pass
from transformers.modeling_attn_mask_utils import (
_prepare_4d_causal_attention_mask_for_sdpa,
)
@ -46,7 +60,7 @@ pass
# [TODO] We must randomly use torch.compile?
# I checked the gradients and formulas and I'm sure it's correct.
# I'm stumped :(
@torch.compile(fullgraph = True, dynamic = True)#, options = torch_compile_options)
@torch.compile(fullgraph = True, dynamic = True, options = torch_compile_options)
def fast_rms_layernorm_gemma2_compiled(layernorm, X, gemma = True):
old_dtype = X.dtype
X = X.float()
@ -70,7 +84,11 @@ def gemma2_attention(Q, K, V, causal_mask, self, bsz, q_len):
K = K.reshape(bsz, n_heads, q_len, head_dim)
V = V.reshape(bsz, n_heads, q_len, head_dim)
s = self.config.hidden_size // self.config.num_attention_heads
# See https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
# Gemma 9b should use 256 and not 224 (hs / nah). 27b uses the below
# We default to using the config file itself
# s = self.config.hidden_size // self.config.num_attention_heads
s = self.config.query_pre_attn_scalar
t = self.config.attn_logit_softcapping
Q = Q * torch.tensor(s**-0.5, dtype = Q.dtype) # Follow Keras exactly
@ -260,7 +278,13 @@ def Gemma2Attention_fast_forward_inference(
# Only for Gemma2
self.temp_O = torch.empty((1, bsz, self.hidden_size), dtype = dtype, device = "cuda:0")
self.attention = torch.empty((bsz, n_heads, 1, KV_CACHE_INCREMENT+seq_len), dtype = dtype, device = "cuda:0")
self.scalar = 1.0 / math_sqrt(self.config.hidden_size // self.config.num_attention_heads)
# See https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
# Gemma 9b should use 256 and not 224 (hs / nah). 27b uses the below
# We default to using the config file itself
# s = self.config.hidden_size // self.config.num_attention_heads
self.scalar = 1.0 / math_sqrt(self.config.query_pre_attn_scalar)
# self.scalar = 1.0 / math_sqrt(self.config.hidden_size // self.config.num_attention_heads)
self.half_head_dim = head_dim // 2
self. t = self.config.attn_logit_softcapping
self.reciprocal_t = 1.0 / self.config.attn_logit_softcapping

View file

@ -1276,12 +1276,14 @@ class FastLlamaModel:
f' "-____-" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}'
logger.warning(debug_info)
import subprocess, re, gc, numpy as np
a = np.array([0,])
try:
a = subprocess.check_output('nvidia-smi --query-gpu=memory.used --format=csv', shell = True)
a = re.findall(rb'([\\d]{1,})[\\s]{1,}M', a)
a = np.array([int(x.decode('utf-8'))/1024 for x in a])
except:
raise RuntimeError('Unsloth: We do not support AMD / Intel machines yet - it is a work in progress!')
a = re.findall(rb'([\\d]{1,})[\\s]{1,}M', a)
a = np.array([int(x.decode('utf-8'))/1024 for x in a])
if not torch.cuda.is_available():
raise RuntimeError('Unsloth: We do not support AMD / Intel machines yet - it is a work in progress!')
if ((a - PRE_CHECK) >= 1).sum() > 1:
raise RuntimeError('Unsloth currently does not support multi GPU setups - but we are working on it!')
for _ in range(3):

View file

@ -22,16 +22,16 @@ from .mapper import INT_TO_FLOAT_MAPPER, FLOAT_TO_INT_MAPPER
import os
# https://github.com/huggingface/transformers/pull/26037 allows 4 bit loading!
major, minor = transformers_version.split(".")[:2]
major, minor = int(major), int(minor)
SUPPORTS_FOURBIT = (major > 4) or (major == 4 and minor >= 37)
SUPPORTS_GEMMA = (major > 4) or (major == 4 and minor >= 38)
SUPPORTS_GEMMA2 = (major > 4) or (major == 4 and minor >= 42)
from packaging.version import Version
transformers_version = Version(transformers_version)
SUPPORTS_FOURBIT = transformers_version >= Version("4.37")
SUPPORTS_GEMMA = transformers_version >= Version("4.38")
SUPPORTS_GEMMA2 = transformers_version >= Version("4.42")
if SUPPORTS_GEMMA:
from .gemma import FastGemmaModel
if SUPPORTS_GEMMA2:
from .gemma2 import FastGemma2Model
del major, minor
pass
def _get_model_name(model_name, load_in_4bit = True):
@ -134,7 +134,7 @@ class FastLanguageModel(FastLlamaModel):
elif model_type == "mistral": dispatch_model = FastMistralModel
elif model_type == "gemma":
if not SUPPORTS_GEMMA:
raise RuntimeError(
raise ImportError(
f"Unsloth: Your transformers version of {transformers_version} does not support Gemma.\n"\
f"The minimum required version is 4.38.\n"\
f'Try `pip install --upgrade "transformers>=4.38"`\n'\
@ -143,10 +143,10 @@ class FastLanguageModel(FastLlamaModel):
dispatch_model = FastGemmaModel
elif model_type == "gemma2":
if not SUPPORTS_GEMMA2:
raise RuntimeError(
raise ImportError(
f"Unsloth: Your transformers version of {transformers_version} does not support Gemma2.\n"\
f"The minimum required version is 4.43.\n"\
f'Try `pip install --upgrade "transformers>=4.43"`\n'\
f"The minimum required version is 4.42.3.\n"\
f'Try `pip install --upgrade "transformers>=4.42.3"`\n'\
f"to obtain the latest transformers build, then restart this session."\
)
dispatch_model = FastGemma2Model

View file

@ -910,12 +910,14 @@ pass
def check_nvidia():
# Unsloth doesn't work yet on AMD devices - we're working on it!
output = np.array([0,])
try:
output = subprocess.check_output("nvidia-smi --query-gpu=memory.used --format=csv", shell = True)
output = re.findall(rb'([\d]{1,})[\s]{1,}M', output)
output = np.array([int(x.decode('utf-8'))/1024 for x in output])
except:
raise RuntimeError("Unsloth: We do not support AMD / Intel machines yet - it is a work in progress!")
output = re.findall(rb'([\d]{1,})[\s]{1,}M', output)
output = np.array([int(x.decode('utf-8'))/1024 for x in output])
if not torch.cuda.is_available():
raise RuntimeError("Unsloth: We do not support AMD / Intel machines yet - it is a work in progress!")
return output
pass
PRE_CHECK = check_nvidia()
@ -972,12 +974,14 @@ def patch_sft_trainer_tokenizer():
" )\n"\
"pass\n"\
"import subprocess, re, gc, numpy as np\n"\
"a = np.array([0,])\n"\
"try:\n"\
" a = subprocess.check_output('nvidia-smi --query-gpu=memory.used --format=csv', shell = True)\n"\
" a = re.findall(rb'([\\d]{1,})[\\s]{1,}M', a)\n"\
" a = np.array([int(x.decode('utf-8'))/1024 for x in a])\n"\
"except:\n"\
" raise RuntimeError('Unsloth: We do not support AMD / Intel machines yet - it is a work in progress!')\n"\
"a = re.findall(rb'([\\d]{1,})[\\s]{1,}M', a)\n"\
"a = np.array([int(x.decode('utf-8'))/1024 for x in a])\n"\
" if not torch.cuda.is_available():\n"\
" raise RuntimeError('Unsloth: We do not support AMD / Intel machines yet - it is a work in progress!')\n"\
"if ((a - PRE_CHECK) >= 1).sum() > 1:\n"\
" raise RuntimeError('Unsloth currently does not support multi GPU setups - but we are working on it!')\n"\
"for _ in range(3):\n"\