mirror of
https://github.com/unslothai/unsloth
synced 2026-04-21 13:37:39 +00:00
Small fixes (#48)
* Fix generation for GQA * Update _utils.py * flash attn * Update _utils.py * Update llama.py * Update mistral.py * platform * Update _utils.py * Update llama.py * Logo changed * Update README.md * Update README.md
This commit is contained in:
parent
37365b6ba9
commit
ef70177a24
7 changed files with 87 additions and 56 deletions
16
README.md
16
README.md
|
|
@ -33,7 +33,7 @@ If you trained a model with Unsloth, we made a cool sticker!!
|
|||
|
||||
# Installation Instructions - Conda
|
||||
Unsloth currently only supports Linux distros and Pytorch == 2.1.
|
||||
```
|
||||
```bash
|
||||
conda install cudatoolkit xformers bitsandbytes pytorch pytorch-cuda=12.1 \
|
||||
-c pytorch -c nvidia -c xformers -c conda-forge -y
|
||||
pip install "unsloth[kaggle] @ git+https://github.com/unslothai/unsloth.git"
|
||||
|
|
@ -41,16 +41,16 @@ pip install "unsloth[kaggle] @ git+https://github.com/unslothai/unsloth.git"
|
|||
|
||||
# Installation Instructions - Pip
|
||||
1. Find your CUDA version via
|
||||
```
|
||||
```python
|
||||
import torch; torch.version.cuda
|
||||
```
|
||||
2. We only support PyTorch 2.1 (2.1.1 is buggy for now). You can update PyTorch via pip (swap cu121 / cu118 to match your CUDA version):
|
||||
```
|
||||
```bash
|
||||
pip install --upgrade --force-reinstall --no-cache-dir torch==2.1.0 triton \
|
||||
--index-url https://download.pytorch.org/whl/cu121
|
||||
```
|
||||
3. Select either cu118 for CUDA 11.8 or cu121 for CUDA 12.1. If you have an RTX 3060 or higher (A100, H100 etc), use the "ampere" path.
|
||||
```
|
||||
```bash
|
||||
pip install "unsloth[cu118] @ git+https://github.com/unslothai/unsloth.git"
|
||||
pip install "unsloth[cu121] @ git+https://github.com/unslothai/unsloth.git"
|
||||
pip install "unsloth[cu118_ampere] @ git+https://github.com/unslothai/unsloth.git"
|
||||
|
|
@ -59,13 +59,13 @@ pip install "unsloth[cu121_ampere] @ git+https://github.com/unslothai/unsloth.gi
|
|||
Use `cu118` for CUDA version 11.8 or `cu121` for CUDA version 12.1. Go to https://pytorch.org/ to learn more.
|
||||
|
||||
4. If you get errors, try the below first, then go back to step 1:
|
||||
```
|
||||
```bash
|
||||
pip install --upgrade pip
|
||||
```
|
||||
|
||||
# Documentation
|
||||
We support Huggingface's TRL, Trainer, Seq2SeqTrainer or even Pytorch code!
|
||||
```
|
||||
```python
|
||||
from unsloth import FastLlamaModel, FastMistralModel
|
||||
import torch
|
||||
max_seq_length = 2048 # Can change to any number <= 4096
|
||||
|
|
@ -305,7 +305,7 @@ $$
|
|||
|
||||
# Troubleshooting
|
||||
1. Sometimes `bitsandbytes` or `xformers` does not link properly. Try running:
|
||||
```
|
||||
```bash
|
||||
!ldconfig /usr/lib64-nvidia
|
||||
```
|
||||
2. Windows is not supported as of yet - we rely on Xformers and Triton, so Unsloth will only support Windows once both packages officially support it.
|
||||
|
|
@ -315,5 +315,5 @@ $$
|
|||
# Credits
|
||||
1. [RandomInternetPreson](https://github.com/RandomInternetPreson) for confirming WSL support
|
||||
2. [152334H](https://github.com/152334H) for experimental DPO support
|
||||
|
||||
3. [atgctg](https://github.com/atgctg) for syntax highlighting
|
||||
<img src="./images/unsloth loading page render.png" width="300" />
|
||||
|
|
|
|||
Binary file not shown.
|
Before Width: | Height: | Size: 59 KiB After Width: | Height: | Size: 62 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 58 KiB After Width: | Height: | Size: 59 KiB |
|
|
@ -20,13 +20,36 @@ import gc
|
|||
warnings.filterwarnings(action = "ignore", category = UserWarning, module = "torch")
|
||||
import bitsandbytes as bnb
|
||||
from transformers.models.llama.modeling_llama import logger
|
||||
import platform
|
||||
from platform import system as platform_system
|
||||
platform_system = platform_system()
|
||||
|
||||
__version__ = "2023.12"
|
||||
|
||||
# Get Flash Attention v2 if Ampere (RTX 30xx, A100)
|
||||
major_version, minor_version = torch.cuda.get_device_capability()
|
||||
if major_version >= 8:
|
||||
try:
|
||||
from flash_attn import flash_attn_func
|
||||
HAS_FLASH_ATTENTION = True
|
||||
except:
|
||||
HAS_FLASH_ATTENTION = False
|
||||
else:
|
||||
# Tri Dao's benchmark shows xformers is faster for now.
|
||||
HAS_FLASH_ATTENTION = False
|
||||
pass
|
||||
import xformers.ops.fmha as xformers
|
||||
xformers_attention = xformers.memory_efficient_attention
|
||||
from xformers import __version__ as xformers_version
|
||||
|
||||
__all__ = [
|
||||
"prepare_model_for_kbit_training",
|
||||
"patch_tokenizer",
|
||||
"print_unsloth_message",
|
||||
"xformers",
|
||||
"xformers_attention",
|
||||
"xformers_version",
|
||||
"__version__",
|
||||
"HAS_FLASH_ATTENTION",
|
||||
"platform_system",
|
||||
]
|
||||
|
||||
|
||||
|
|
@ -71,6 +94,7 @@ pass
|
|||
|
||||
|
||||
def patch_tokenizer(model, tokenizer):
|
||||
model.config.update({"unsloth_version" : __version__})
|
||||
if not hasattr(tokenizer, "pad_token") or tokenizer.pad_token is None:
|
||||
# Fixes https://github.com/unslothai/unsloth/issues/5
|
||||
if hasattr(tokenizer, "unk_token"):
|
||||
|
|
@ -88,18 +112,3 @@ def patch_tokenizer(model, tokenizer):
|
|||
pass
|
||||
return model, tokenizer
|
||||
pass
|
||||
|
||||
|
||||
def print_unsloth_message(name):
|
||||
SUPPORTS_BFLOAT16 = torch.cuda.is_bf16_supported()
|
||||
gpu_stats = torch.cuda.get_device_properties(0)
|
||||
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
|
||||
|
||||
statistics = \
|
||||
f"==((====))== Unsloth: Fast {name} patching release {__version__}\n"\
|
||||
f" \\\ /| GPU: {gpu_stats.name}. Max memory: {max_memory} GB\n"\
|
||||
f"O^O/ \_/ \\ CUDA compute capability = {gpu_stats.major}.{gpu_stats.minor}\n"\
|
||||
f"\ / Pytorch version: {torch.__version__}. CUDA Toolkit = {torch.version.cuda}\n"\
|
||||
f' "-____-" bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. Platform = {platform.system()}\n'
|
||||
print(statistics)
|
||||
pass
|
||||
|
|
|
|||
|
|
@ -23,21 +23,9 @@ from transformers.models.llama.modeling_llama import (
|
|||
)
|
||||
from ..kernels import *
|
||||
from ._utils import *
|
||||
|
||||
# Get Flash Attention v2 if Ampere (RTX 30xx, A100)
|
||||
major_version, minor_version = torch.cuda.get_device_capability()
|
||||
if major_version >= 8:
|
||||
try:
|
||||
from flash_attn import flash_attn_func
|
||||
HAS_FLASH_ATTENTION = True
|
||||
except:
|
||||
HAS_FLASH_ATTENTION = False
|
||||
else:
|
||||
# Tri Dao's benchmark shows xformers is faster for now.
|
||||
HAS_FLASH_ATTENTION = False
|
||||
pass
|
||||
import xformers.ops.fmha as xformers
|
||||
xformers_attention = xformers.memory_efficient_attention
|
||||
from ._utils import __version__
|
||||
if HAS_FLASH_ATTENTION:
|
||||
from flash_attn import flash_attn_func
|
||||
|
||||
# Final patching code
|
||||
from transformers.models.llama.modeling_llama import (
|
||||
|
|
@ -139,19 +127,20 @@ def LlamaAttention_fast_forward_inference(
|
|||
# V = repeat_kv(V, n_groups)
|
||||
if n_groups != 1:
|
||||
_, _, cached_len, _ = Kn.shape
|
||||
Kn = Kn[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, cached_len, head_dim)
|
||||
Vn = Vn[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, cached_len, head_dim)
|
||||
Kn = Kn.reshape(bsz, n_heads, cached_len, head_dim)
|
||||
Vn = Vn.reshape(bsz, n_heads, cached_len, head_dim)
|
||||
pass
|
||||
Knn = Kn[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, cached_len, head_dim)
|
||||
Vnn = Vn[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, cached_len, head_dim)
|
||||
Knn = Knn.view(bsz, n_heads, cached_len, head_dim)
|
||||
Vnn = Vnn.view(bsz, n_heads, cached_len, head_dim)
|
||||
else:
|
||||
Knn, Vnn = Kn, Vn
|
||||
|
||||
# Attention
|
||||
A = torch.matmul(Qn, Kn.transpose(2, 3))
|
||||
A = torch.matmul(Qn, Knn.transpose(2, 3))
|
||||
A *= 1.0 / (self.head_dim**0.5)
|
||||
A = torch.nn.functional.softmax(A, dim = -1, dtype = torch.float32).to(A.dtype)
|
||||
A = torch.matmul(A, Vn)
|
||||
A = torch.matmul(A, Vnn)
|
||||
A = A.transpose(1, 2)
|
||||
A = A.reshape(bsz, 1, self.hidden_size)
|
||||
A = A.view(bsz, 1, self.hidden_size)
|
||||
A = original_apply_o(self, A)
|
||||
return A, (Kn, Vn)
|
||||
pass
|
||||
|
|
@ -359,13 +348,13 @@ def LlamaModel_fast_forward(
|
|||
|
||||
# retrieve input_ids and inputs_embeds
|
||||
if input_ids is not None and inputs_embeds is not None:
|
||||
raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
|
||||
raise ValueError("Unsloth: You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
|
||||
elif input_ids is not None:
|
||||
batch_size, seq_length = input_ids.shape
|
||||
elif inputs_embeds is not None:
|
||||
batch_size, seq_length, _ = inputs_embeds.shape
|
||||
else:
|
||||
raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
|
||||
raise ValueError("Unsloth: You have to specify either decoder_input_ids or decoder_inputs_embeds")
|
||||
|
||||
seq_length_with_past = seq_length
|
||||
past_key_values_length = 0
|
||||
|
|
@ -419,7 +408,7 @@ def LlamaModel_fast_forward(
|
|||
if self.gradient_checkpointing and self.training:
|
||||
if use_cache:
|
||||
logger.warning_once(
|
||||
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
|
||||
"Unsloth: `use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`"
|
||||
)
|
||||
use_cache = False
|
||||
pass
|
||||
|
|
@ -614,7 +603,16 @@ class FastLlamaModel:
|
|||
rope_scaling = None,
|
||||
):
|
||||
SUPPORTS_BFLOAT16 = torch.cuda.is_bf16_supported()
|
||||
print_unsloth_message("Llama")
|
||||
gpu_stats = torch.cuda.get_device_properties(0)
|
||||
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
|
||||
|
||||
statistics = \
|
||||
f"==((====))== Unsloth: Fast Llama patching release {__version__}\n"\
|
||||
f" \\\ /| GPU: {gpu_stats.name}. Max memory: {max_memory} GB\n"\
|
||||
f"O^O/ \_/ \\ CUDA capability = {gpu_stats.major}.{gpu_stats.minor}. Xformers = {xformers_version}. FA = {HAS_FLASH_ATTENTION}.\n"\
|
||||
f"\ / Pytorch version: {torch.__version__}. CUDA Toolkit = {torch.version.cuda}\n"\
|
||||
f' "-____-" bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. Platform = {platform_system}\n'
|
||||
logger.warning_once(statistics)
|
||||
FastLlamaModel.pre_patch()
|
||||
|
||||
if dtype is None:
|
||||
|
|
@ -632,7 +630,7 @@ class FastLlamaModel:
|
|||
if (rope_scaling is None) and (max_seq_length > model_max_seq_length):
|
||||
rope_scaling = max_seq_length / model_max_seq_length
|
||||
logger.warning_once(
|
||||
f"Unsloth: {model_name} can only handle sequence lengths of of most "\
|
||||
f"Unsloth: {model_name} can only handle sequence lengths of at most "\
|
||||
f"{model_max_seq_length}.\nBut with kaiokendev's RoPE scaling of "\
|
||||
f"{round(rope_scaling, 3)}, it can be magically be extended to "\
|
||||
f"{max_seq_length}!"
|
||||
|
|
@ -686,6 +684,7 @@ class FastLlamaModel:
|
|||
# Torch.compile fails on embedding matrix??
|
||||
# Workaround randomly fixes it for torch versions < 2.2
|
||||
model.model.embed_tokens = torch.nn.Embedding.from_pretrained(model.model.embed_tokens.weight)
|
||||
model.config.update({"unsloth_version" : __version__})
|
||||
|
||||
# We also do this for the lm_head
|
||||
lm_head = torch.nn.Linear(1, 1, bias = None)
|
||||
|
|
@ -747,6 +746,7 @@ class FastLlamaModel:
|
|||
|
||||
accepted_modules = frozenset(("q_proj", "k_proj", "v_proj", "o_proj",
|
||||
"gate_proj", "up_proj", "down_proj",),)
|
||||
model.config.update({"unsloth_version" : __version__})
|
||||
for module in target_modules:
|
||||
assert(module in accepted_modules)
|
||||
pass
|
||||
|
|
@ -771,6 +771,9 @@ class FastLlamaModel:
|
|||
model = _get_peft_model(model, lora_config)
|
||||
|
||||
# Do patching
|
||||
n_mlp = 0
|
||||
n_qkv = 0
|
||||
n_o = 0
|
||||
for idx, layer in enumerate(model.model.model.layers):
|
||||
|
||||
# MLP patching
|
||||
|
|
@ -780,6 +783,7 @@ class FastLlamaModel:
|
|||
|
||||
# https://stackoverflow.com/questions/50599045/python-replacing-a-function-within-a-class-of-a-module
|
||||
layer.mlp.forward = types.MethodType(apply_lora_mlp, layer.mlp)
|
||||
n_mlp += 1
|
||||
pass
|
||||
|
||||
# QKV attention patching
|
||||
|
|
@ -788,15 +792,22 @@ class FastLlamaModel:
|
|||
hasattr(layer.self_attn.v_proj, "lora_A"):
|
||||
|
||||
layer.self_attn.apply_qkv = apply_lora_qkv
|
||||
n_qkv += 1
|
||||
pass
|
||||
|
||||
# O attention patching
|
||||
if hasattr(layer.self_attn.o_proj, "lora_A"):
|
||||
|
||||
layer.self_attn.apply_o = apply_lora_o
|
||||
n_o += 1
|
||||
pass
|
||||
pass
|
||||
|
||||
logger.warning_once(
|
||||
f"Unsloth {__version__} patched {len(model.model.model.layers)} layers with "\
|
||||
f"{n_qkv} QKV layers, {n_o} O layers and {n_mlp} MLP layers.",
|
||||
)
|
||||
|
||||
# Patch cross entropy loss labels
|
||||
# Fixes https://github.com/unslothai/unsloth/issues/10
|
||||
extra_ignored_labels = torch.full((max_seq_length, 1), -100, device = "cuda")
|
||||
|
|
|
|||
|
|
@ -45,7 +45,7 @@ class FastLanguageModel:
|
|||
)
|
||||
elif model_type == "mistral":
|
||||
if rope_scaling is not None:
|
||||
logger.warning_once("Mistral models do not support RoPE scaling.")
|
||||
logger.warning_once("Unsloth: Mistral models do not support RoPE scaling.")
|
||||
return FastMistralModel.from_pretrained(
|
||||
model_name = model_name,
|
||||
max_seq_length = max_seq_length,
|
||||
|
|
@ -57,7 +57,8 @@ class FastLanguageModel:
|
|||
)
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
f"{model_name} not supported yet! Make an issue to https://github.com/unslothai/unsloth!",
|
||||
f"Unsloth: {model_name} not supported yet!\n"\
|
||||
"Make an issue to https://github.com/unslothai/unsloth!",
|
||||
)
|
||||
pass
|
||||
pass
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@
|
|||
# limitations under the License.
|
||||
|
||||
from .llama import *
|
||||
from ._utils import __version__
|
||||
|
||||
from transformers.models.mistral.modeling_mistral import (
|
||||
MistralAttention,
|
||||
|
|
@ -245,7 +246,16 @@ class FastMistralModel(FastLlamaModel):
|
|||
# rope_scaling = None, Mistral does not support RoPE scaling
|
||||
):
|
||||
SUPPORTS_BFLOAT16 = torch.cuda.is_bf16_supported()
|
||||
print_unsloth_message("Mistral")
|
||||
gpu_stats = torch.cuda.get_device_properties(0)
|
||||
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
|
||||
|
||||
statistics = \
|
||||
f"==((====))== Unsloth: Fast Mistral patching release {__version__}\n"\
|
||||
f" \\\ /| GPU: {gpu_stats.name}. Max memory: {max_memory} GB\n"\
|
||||
f"O^O/ \_/ \\ CUDA capability = {gpu_stats.major}.{gpu_stats.minor}. Xformers = {xformers_version}. FA = {HAS_FLASH_ATTENTION}.\n"\
|
||||
f"\ / Pytorch version: {torch.__version__}. CUDA Toolkit = {torch.version.cuda}\n"\
|
||||
f' "-____-" bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. Platform = {platform_system}\n'
|
||||
logger.warning_once(statistics)
|
||||
FastMistralModel.pre_patch()
|
||||
|
||||
if dtype is None:
|
||||
|
|
|
|||
Loading…
Reference in a new issue