Small fixes (#48)

* Fix generation for GQA

* Update _utils.py

* flash attn

* Update _utils.py

* Update llama.py

* Update mistral.py

* platform

* Update _utils.py

* Update llama.py

* Logo changed

* Update README.md

* Update README.md
This commit is contained in:
Daniel Han 2023-12-23 04:22:48 +11:00 committed by GitHub
parent 37365b6ba9
commit ef70177a24
7 changed files with 87 additions and 56 deletions

View file

@ -33,7 +33,7 @@ If you trained a model with Unsloth, we made a cool sticker!!
# Installation Instructions - Conda
Unsloth currently only supports Linux distros and Pytorch == 2.1.
```
```bash
conda install cudatoolkit xformers bitsandbytes pytorch pytorch-cuda=12.1 \
-c pytorch -c nvidia -c xformers -c conda-forge -y
pip install "unsloth[kaggle] @ git+https://github.com/unslothai/unsloth.git"
@ -41,16 +41,16 @@ pip install "unsloth[kaggle] @ git+https://github.com/unslothai/unsloth.git"
# Installation Instructions - Pip
1. Find your CUDA version via
```
```python
import torch; torch.version.cuda
```
2. We only support Pytorch 2.1 (2.1.1 bugs out for now): You can update Pytorch via Pip (interchange cu121 / cu118)
```
```bash
pip install --upgrade --force-reinstall --no-cache-dir torch==2.1.0 triton \
--index-url https://download.pytorch.org/whl/cu121
```
3. Select either cu118 for CUDA 11.8 or cu121 for CUDA 12.1. If you have a RTX 3060 or higher (A100, H100 etc), use the "ampere" path.
```
```bash
pip install "unsloth[cu118] @ git+https://github.com/unslothai/unsloth.git"
pip install "unsloth[cu121] @ git+https://github.com/unslothai/unsloth.git"
pip install "unsloth[cu118_ampere] @ git+https://github.com/unslothai/unsloth.git"
@ -59,13 +59,13 @@ pip install "unsloth[cu121_ampere] @ git+https://github.com/unslothai/unsloth.gi
Change `cu121` to `cu118` for CUDA version 11.8 or 12.1. Go to https://pytorch.org/ to learn more.
4. If you get errors, try the below first, then go back to step 1:
```
```bash
pip install --upgrade pip
```
# Documentation
We support Huggingface's TRL, Trainer, Seq2SeqTrainer or even Pytorch code!
```
```python
from unsloth import FastLlamaModel, FastMistralModel
import torch
max_seq_length = 2048 # Can change to any number <= 4096
@ -305,7 +305,7 @@ $$
# Troubleshooting
1. Sometimes `bitsandbytes` or `xformers` does not link properly. Try running:
```
```bash
!ldconfig /usr/lib64-nvidia
```
2. Windows is not supported as of yet - we rely on Xformers and Triton support, so until both packages officially support Windows, Unsloth will not be able to support Windows.
@ -315,5 +315,5 @@ $$
# Credits
1. [RandomInternetPreson](https://github.com/RandomInternetPreson) for confirming WSL support
2. [152334H](https://github.com/152334H) for experimental DPO support
3. [atgctg](https://github.com/atgctg) for syntax highlighting
<img src="./images/unsloth loading page render.png" width="300" />

Binary file not shown.

Before

Width:  |  Height:  |  Size: 59 KiB

After

Width:  |  Height:  |  Size: 62 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 58 KiB

After

Width:  |  Height:  |  Size: 59 KiB

View file

@ -20,13 +20,36 @@ import gc
warnings.filterwarnings(action = "ignore", category = UserWarning, module = "torch")
import bitsandbytes as bnb
from transformers.models.llama.modeling_llama import logger
import platform
from platform import system as platform_system
platform_system = platform_system()
# Library release version; also written into model configs elsewhere in this
# module (see patch_tokenizer) so saved checkpoints record the Unsloth version.
__version__ = "2023.12"
# Get Flash Attention v2 if Ampere (RTX 30xx, A100)
# CUDA compute capability of device 0; major >= 8 means Ampere or newer (SM80+),
# which is what Flash Attention v2 requires.
major_version, minor_version = torch.cuda.get_device_capability()
if major_version >= 8:
# flash_attn is an optional dependency — degrade gracefully if missing.
# NOTE(review): the bare `except` also swallows non-import failures
# (e.g. a broken flash_attn build); `except ImportError` would be safer.
try:
from flash_attn import flash_attn_func
HAS_FLASH_ATTENTION = True
except:
HAS_FLASH_ATTENTION = False
else:
# Tri Dao's benchmark shows xformers is faster for now.
HAS_FLASH_ATTENTION = False
pass
import xformers.ops.fmha as xformers
xformers_attention = xformers.memory_efficient_attention
from xformers import __version__ as xformers_version
# Public API re-exported via `from ._utils import *` (used by llama.py and
# mistral.py): patching helpers plus the xformers / flash-attn / platform
# capability flags detected at import time above.
__all__ = [
"prepare_model_for_kbit_training",
"patch_tokenizer",
"print_unsloth_message",
"xformers",
"xformers_attention",
"xformers_version",
"__version__",
"HAS_FLASH_ATTENTION",
"platform_system",
]
@ -71,6 +94,7 @@ pass
def patch_tokenizer(model, tokenizer):
model.config.update({"unsloth_version" : __version__})
if not hasattr(tokenizer, "pad_token") or tokenizer.pad_token is None:
# Fixes https://github.com/unslothai/unsloth/issues/5
if hasattr(tokenizer, "unk_token"):
@ -88,18 +112,3 @@ def patch_tokenizer(model, tokenizer):
pass
return model, tokenizer
pass
def print_unsloth_message(name):
# Print the Unsloth startup banner (ASCII sloth art) with runtime statistics:
# GPU model and VRAM, CUDA compute capability, Pytorch/CUDA toolkit versions,
# bfloat16 support, and OS platform.
#
# name: model-family label interpolated into the banner (e.g. "Llama", "Mistral").
SUPPORTS_BFLOAT16 = torch.cuda.is_bf16_supported()
# Queries device 0 only — assumes a single-GPU (or primary-GPU) setup; TODO confirm.
gpu_stats = torch.cuda.get_device_properties(0)
# Total VRAM converted from bytes to GB, rounded to 3 decimal places.
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
# The doubled/tripled backslashes below are literal characters of the ASCII art.
statistics = \
f"==((====))== Unsloth: Fast {name} patching release {__version__}\n"\
f" \\\ /| GPU: {gpu_stats.name}. Max memory: {max_memory} GB\n"\
f"O^O/ \_/ \\ CUDA compute capability = {gpu_stats.major}.{gpu_stats.minor}\n"\
f"\ / Pytorch version: {torch.__version__}. CUDA Toolkit = {torch.version.cuda}\n"\
f' "-____-" bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. Platform = {platform.system()}\n'
print(statistics)
pass

View file

@ -23,21 +23,9 @@ from transformers.models.llama.modeling_llama import (
)
from ..kernels import *
from ._utils import *
# Get Flash Attention v2 if Ampere (RTX 30xx, A100)
major_version, minor_version = torch.cuda.get_device_capability()
if major_version >= 8:
try:
from flash_attn import flash_attn_func
HAS_FLASH_ATTENTION = True
except:
HAS_FLASH_ATTENTION = False
else:
# Tri Dao's benchmark shows xformers is faster for now.
HAS_FLASH_ATTENTION = False
pass
import xformers.ops.fmha as xformers
xformers_attention = xformers.memory_efficient_attention
from ._utils import __version__
if HAS_FLASH_ATTENTION:
from flash_attn import flash_attn_func
# Final patching code
from transformers.models.llama.modeling_llama import (
@ -139,19 +127,20 @@ def LlamaAttention_fast_forward_inference(
# V = repeat_kv(V, n_groups)
if n_groups != 1:
_, _, cached_len, _ = Kn.shape
Kn = Kn[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, cached_len, head_dim)
Vn = Vn[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, cached_len, head_dim)
Kn = Kn.reshape(bsz, n_heads, cached_len, head_dim)
Vn = Vn.reshape(bsz, n_heads, cached_len, head_dim)
pass
Knn = Kn[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, cached_len, head_dim)
Vnn = Vn[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, cached_len, head_dim)
Knn = Knn.view(bsz, n_heads, cached_len, head_dim)
Vnn = Vnn.view(bsz, n_heads, cached_len, head_dim)
else:
Knn, Vnn = Kn, Vn
# Attention
A = torch.matmul(Qn, Kn.transpose(2, 3))
A = torch.matmul(Qn, Knn.transpose(2, 3))
A *= 1.0 / (self.head_dim**0.5)
A = torch.nn.functional.softmax(A, dim = -1, dtype = torch.float32).to(A.dtype)
A = torch.matmul(A, Vn)
A = torch.matmul(A, Vnn)
A = A.transpose(1, 2)
A = A.reshape(bsz, 1, self.hidden_size)
A = A.view(bsz, 1, self.hidden_size)
A = original_apply_o(self, A)
return A, (Kn, Vn)
pass
@ -359,13 +348,13 @@ def LlamaModel_fast_forward(
# retrieve input_ids and inputs_embeds
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
raise ValueError("Unsloth: You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
elif input_ids is not None:
batch_size, seq_length = input_ids.shape
elif inputs_embeds is not None:
batch_size, seq_length, _ = inputs_embeds.shape
else:
raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
raise ValueError("Unsloth: You have to specify either decoder_input_ids or decoder_inputs_embeds")
seq_length_with_past = seq_length
past_key_values_length = 0
@ -419,7 +408,7 @@ def LlamaModel_fast_forward(
if self.gradient_checkpointing and self.training:
if use_cache:
logger.warning_once(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
"Unsloth: `use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`"
)
use_cache = False
pass
@ -614,7 +603,16 @@ class FastLlamaModel:
rope_scaling = None,
):
SUPPORTS_BFLOAT16 = torch.cuda.is_bf16_supported()
print_unsloth_message("Llama")
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
statistics = \
f"==((====))== Unsloth: Fast Llama patching release {__version__}\n"\
f" \\\ /| GPU: {gpu_stats.name}. Max memory: {max_memory} GB\n"\
f"O^O/ \_/ \\ CUDA capability = {gpu_stats.major}.{gpu_stats.minor}. Xformers = {xformers_version}. FA = {HAS_FLASH_ATTENTION}.\n"\
f"\ / Pytorch version: {torch.__version__}. CUDA Toolkit = {torch.version.cuda}\n"\
f' "-____-" bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. Platform = {platform_system}\n'
logger.warning_once(statistics)
FastLlamaModel.pre_patch()
if dtype is None:
@ -632,7 +630,7 @@ class FastLlamaModel:
if (rope_scaling is None) and (max_seq_length > model_max_seq_length):
rope_scaling = max_seq_length / model_max_seq_length
logger.warning_once(
f"Unsloth: {model_name} can only handle sequence lengths of of most "\
f"Unsloth: {model_name} can only handle sequence lengths of at most "\
f"{model_max_seq_length}.\nBut with kaiokendev's RoPE scaling of "\
f"{round(rope_scaling, 3)}, it can be magically be extended to "\
f"{max_seq_length}!"
@ -686,6 +684,7 @@ class FastLlamaModel:
# Torch.compile fails on embedding matrix??
# Workaround randomly fixes it for torch versions < 2.2
model.model.embed_tokens = torch.nn.Embedding.from_pretrained(model.model.embed_tokens.weight)
model.config.update({"unsloth_version" : __version__})
# We also do this for the lm_head
lm_head = torch.nn.Linear(1, 1, bias = None)
@ -747,6 +746,7 @@ class FastLlamaModel:
accepted_modules = frozenset(("q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj",),)
model.config.update({"unsloth_version" : __version__})
for module in target_modules:
assert(module in accepted_modules)
pass
@ -771,6 +771,9 @@ class FastLlamaModel:
model = _get_peft_model(model, lora_config)
# Do patching
n_mlp = 0
n_qkv = 0
n_o = 0
for idx, layer in enumerate(model.model.model.layers):
# MLP patching
@ -780,6 +783,7 @@ class FastLlamaModel:
# https://stackoverflow.com/questions/50599045/python-replacing-a-function-within-a-class-of-a-module
layer.mlp.forward = types.MethodType(apply_lora_mlp, layer.mlp)
n_mlp += 1
pass
# QKV attention patching
@ -788,15 +792,22 @@ class FastLlamaModel:
hasattr(layer.self_attn.v_proj, "lora_A"):
layer.self_attn.apply_qkv = apply_lora_qkv
n_qkv += 1
pass
# O attention patching
if hasattr(layer.self_attn.o_proj, "lora_A"):
layer.self_attn.apply_o = apply_lora_o
n_o += 1
pass
pass
logger.warning_once(
f"Unsloth {__version__} patched {len(model.model.model.layers)} layers with "\
f"{n_qkv} QKV layers, {n_o} O layers and {n_mlp} MLP layers.",
)
# Patch cross entropy loss labels
# Fixes https://github.com/unslothai/unsloth/issues/10
extra_ignored_labels = torch.full((max_seq_length, 1), -100, device = "cuda")

View file

@ -45,7 +45,7 @@ class FastLanguageModel:
)
elif model_type == "mistral":
if rope_scaling is not None:
logger.warning_once("Mistral models do not support RoPE scaling.")
logger.warning_once("Unsloth: Mistral models do not support RoPE scaling.")
return FastMistralModel.from_pretrained(
model_name = model_name,
max_seq_length = max_seq_length,
@ -57,7 +57,8 @@ class FastLanguageModel:
)
else:
raise NotImplementedError(
f"{model_name} not supported yet! Make an issue to https://github.com/unslothai/unsloth!",
f"Unsloth: {model_name} not supported yet!\n"\
"Make an issue to https://github.com/unslothai/unsloth!",
)
pass
pass

View file

@ -13,6 +13,7 @@
# limitations under the License.
from .llama import *
from ._utils import __version__
from transformers.models.mistral.modeling_mistral import (
MistralAttention,
@ -245,7 +246,16 @@ class FastMistralModel(FastLlamaModel):
# rope_scaling = None, Mistral does not support RoPE scaling
):
SUPPORTS_BFLOAT16 = torch.cuda.is_bf16_supported()
print_unsloth_message("Mistral")
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
statistics = \
f"==((====))== Unsloth: Fast Mistral patching release {__version__}\n"\
f" \\\ /| GPU: {gpu_stats.name}. Max memory: {max_memory} GB\n"\
f"O^O/ \_/ \\ CUDA capability = {gpu_stats.major}.{gpu_stats.minor}. Xformers = {xformers_version}. FA = {HAS_FLASH_ATTENTION}.\n"\
f"\ / Pytorch version: {torch.__version__}. CUDA Toolkit = {torch.version.cuda}\n"\
f' "-____-" bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. Platform = {platform_system}\n'
logger.warning_once(statistics)
FastMistralModel.pre_patch()
if dtype is None: