mirror of
https://github.com/unslothai/unsloth
synced 2026-04-21 13:37:39 +00:00
Fix more bugs (#232)
Squashed commit history (condensed; the intermediate "Update gemma.py" / "Update llama.py" / "Update cross_entropy_loss.py" / "Update save.py" iterations are omitted):

* Gemma support: FastGemmaModel, model_type, position_ids, RoPE, norm, sqrt, correct_dtype, Approx gelu (geglu.py), rms_layernorm.py, Layernorms, Gemma precision, Fix Gemma merging
* Fast CE Loss (cross_entropy_loss.py)
* Chat Templates (chat_templates.py), Gemma prompt template, README.md updates
* DoRA (fast_lora.py, _utils.py), Account for DoRA
* Hotfix - fix DoRA, Gemma prompt template (#202) (#203)
* Saving (save.py, __init__.py): Tokenizer overwritten, model_name, Accuracy, Revert
* mapper.py, loader.py, pyproject.toml, Fix warning, Small fixes
parent 8bea94c137
commit 32223779c4
4 changed files with 114 additions and 25 deletions
unsloth/chat_templates.py
@@ -253,13 +253,15 @@ def get_chat_template(
     mapping = {"role" : "role", "content" : "content", "user" : "user", "assistant" : "assistant"},
     map_eos_token = True,
 ):
+    old_tokenizer = tokenizer
+
     if map_eos_token is False:
         assert("Unsloth: Can only map new tokens to EOS for now. Adding new tokens is not yet supported.")
     pass
 
-    # if tokenizer.__class__.__name__.startswith("Gemma") and chat_template == "chatml":
-    #     chat_template = "gemma_chatml"
-    # pass
+    if tokenizer.__class__.__name__.startswith("Gemma") and chat_template == "chatml":
+        chat_template = "gemma_chatml"
+    pass
 
     old_padding_side = tokenizer.padding_side
 
@@ -340,6 +342,17 @@ def get_chat_template(
     tokenizer.padding_side = old_padding_side
     tokenizer.chat_template = chat_template
 
+    # Also fix up other tokens
+    old_pad_token = getattr(old_tokenizer, "pad_token", None)
+    old_bos_token = getattr(old_tokenizer, "bos_token", None)
+    old_unk_token = getattr(old_tokenizer, "unk_token", None)
+    new_pad_token = getattr(tokenizer, "pad_token", None)
+    new_bos_token = getattr(tokenizer, "bos_token", None)
+    new_unk_token = getattr(tokenizer, "unk_token", None)
+    if old_pad_token != new_pad_token: tokenizer.pad_token = old_pad_token
+    if old_bos_token != new_bos_token: tokenizer.bos_token = old_bos_token
+    if old_unk_token != new_unk_token: tokenizer.unk_token = old_unk_token
+
     #stopping_criteria = create_stopping_criteria(tokenizer, stop_word)
 
     return tokenizer#, stopping_criteria
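Note: the point of the block added above is that swapping in a chat template (and optionally remapping EOS) can overwrite tokenizer attributes, and the original pad/bos/unk tokens should survive that swap. A standalone sketch of the same restore step, with an illustrative helper name that is not part of the diff:

    # Sketch only: put back special tokens that a chat-template swap may have clobbered.
    def restore_special_tokens(old_tokenizer, tokenizer):
        for attr in ("pad_token", "bos_token", "unk_token"):
            old_token = getattr(old_tokenizer, attr, None)
            new_token = getattr(tokenizer, attr, None)
            if old_token != new_token:
                setattr(tokenizer, attr, old_token)
        return tokenizer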
unsloth/models/_utils.py
@@ -95,7 +95,7 @@ def prepare_model_for_kbit_training(
 
     # Freeze all parameters except LoRA
     for name, param in model.named_parameters():
-        if ".lora_A." in name or ".lora_B." in name:
+        if ".lora_A." in name or ".lora_B." in name or ".lora_magnitude_vector" in name:
             param.requires_grad_(True)
         else:
             param.requires_grad_(False)
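Note: `.lora_magnitude_vector` is the extra magnitude parameter PEFT attaches for DoRA, so it must stay trainable alongside `lora_A` / `lora_B`; with the old condition DoRA magnitudes were frozen. A minimal sketch of the same rule applied to any PEFT-wrapped model (function name is illustrative):

    # Sketch: keep only LoRA / DoRA parameters trainable; freeze everything else.
    def freeze_all_but_lora(model):
        n_trainable = 0
        for name, param in model.named_parameters():
            if ".lora_A." in name or ".lora_B." in name or ".lora_magnitude_vector" in name:
                param.requires_grad_(True)
                n_trainable += param.numel()
            else:
                param.requires_grad_(False)
        return n_trainable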
unsloth/models/llama.py
@@ -511,26 +511,36 @@ def LlamaModel_fast_forward(
     # Normalized from Gemma
     IS_GEMMA = self.config.model_type == "gemma"
+    train_embed_tokens = self.embed_tokens.weight.requires_grad
+
     if IS_GEMMA:
-        inputs_requires_grad = inputs_embeds.requires_grad
-        if not inputs_embeds.is_leaf:
-            inputs_embeds = inputs_embeds.detach()
-            inputs_requires_grad = True
-        elif inputs_requires_grad:
-            inputs_embeds.requires_grad_(False)
-        pass
         # Match Gemma exactly by casting to bfloat16 / float16
         # inputs_embeds *= math_sqrt(self.config.hidden_size)
         # Ie 3072**0.5 = 55.5000 in bfloat16, whilst 55.4256 in float32
         # & 2048**0.5 = 45.2500 in bfloat16, whilst 45.2548 in float32
-        inputs_embeds *= torch.tensor(math_sqrt(self.config.hidden_size), dtype = inputs_embeds.dtype)
-        # inputs_embeds *= math_sqrt(self.config.hidden_size)
-        if inputs_requires_grad: inputs_embeds.requires_grad_(True)
+        normalizer = torch.tensor(math_sqrt(self.config.hidden_size), dtype = inputs_embeds.dtype)
+
+        if train_embed_tokens:
+            # Careful we must not do an inplace op!
+            inputs_embeds = inputs_embeds * normalizer
+        else:
+            inputs_requires_grad = inputs_embeds.requires_grad
+            if not inputs_embeds.is_leaf:
+                inputs_embeds = inputs_embeds.detach()
+                inputs_requires_grad = True
+            elif inputs_requires_grad:
+                inputs_embeds.requires_grad_(False)
+            pass
+            inputs_embeds *= normalizer
+            # inputs_embeds *= math_sqrt(self.config.hidden_size)
+            if inputs_requires_grad: inputs_embeds.requires_grad_(True)
+        pass
     pass
 
     # Fix up attention mask by setting elements to 0
     # Specifically for DPO
-    if self._has_no_labels and (attention_mask is not None) and (past_key_values is None):
+    if self._has_no_labels and (attention_mask is not None) and (past_key_values is None) and \
+        (not train_embed_tokens):
         # Careful for inference the attention_mask is size (1, kv_seq_len)
         # Whilst the input_embeds is size (1, 1, 4096)
         inputs_requires_grad = inputs_embeds.requires_grad
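Note: the comments in the hunk above are easy to reproduce. Gemma scales embeddings by sqrt(hidden_size) after casting to the compute dtype, and the cast visibly changes the constant, which is why the normalizer is built as a tensor in `inputs_embeds.dtype` instead of being multiplied in as a Python float. A quick check (values match the comments):

    import torch
    from math import sqrt

    print(sqrt(3072))                                         # 55.42562... in full precision
    print(torch.tensor(sqrt(3072), dtype = torch.bfloat16))   # tensor(55.5000, dtype=torch.bfloat16)
    print(torch.tensor(sqrt(2048), dtype = torch.bfloat16))   # tensor(45.2500, dtype=torch.bfloat16)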
@@ -1226,6 +1236,7 @@ class FastLlamaModel:
         random_state = 3407,
         max_seq_length = 2048, # not used anymore
         use_rslora = False,
+        modules_to_save = None,
         init_lora_weights = True,
         loftq_config = {},
         **kwargs,
@@ -1312,15 +1323,45 @@ class FastLlamaModel:
         accepted_modules = frozenset(("q_proj", "k_proj", "v_proj", "o_proj",
                                       "gate_proj", "up_proj", "down_proj",),)
         model.config.update({"unsloth_version" : __version__})
+
+        train_lm_head = False
+        train_embed_tokens = False
+        final_modules = []
         for module in target_modules:
-            assert(module in accepted_modules)
+            if module == "lm_head":
+                logger.warning_once(
+                    "Unsloth: `lm_head` should be placed in `modules_to_save` and not `target_modules`."\
+                    "We shall do it for you!"
+                )
+                train_lm_head = True
+
+            elif module == "embed_tokens":
+                logger.warning_once(
+                    "Unsloth: `embed_tokens` should be placed in `modules_to_save` and not `target_modules`."\
+                    "We shall do it for you!"
+                )
+                train_embed_tokens = True
+
+            else:
+                assert(module in accepted_modules)
+                final_modules.append(module)
         pass
 
+        # Check modules_to_save
+        if modules_to_save is not None:
+            for module in modules_to_save:
+                if module == "lm_head":
+                    train_lm_head = True
+                elif module == "embed_tokens":
+                    train_embed_tokens = True
+            pass
+        pass
+
         # Get LoRA
         arguments = dict(
             r = r,
             lora_alpha = lora_alpha,
-            target_modules = target_modules,
+            target_modules = final_modules,
             lora_dropout = lora_dropout,
             bias = bias,
             task_type = TaskType.CAUSAL_LM,
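Note: with this change, callers who want the embedding matrix or LM head trained pass them through `modules_to_save`; listing them in `target_modules` now only triggers the warning and is handled automatically. A hedged usage sketch (model name and hyperparameters are illustrative, not taken from the diff):

    from unsloth import FastLanguageModel

    model, tokenizer = FastLanguageModel.from_pretrained("unsloth/gemma-7b-bnb-4bit", load_in_4bit = True)
    model = FastLanguageModel.get_peft_model(
        model,
        r = 16,
        lora_alpha = 16,
        target_modules  = ["q_proj", "k_proj", "v_proj", "o_proj",
                           "gate_proj", "up_proj", "down_proj"],
        modules_to_save = ["embed_tokens", "lm_head"],   # trained in full, not via LoRA target_modules
    )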
@@ -1328,6 +1369,7 @@ class FastLlamaModel:
             init_lora_weights = init_lora_weights,
             loftq_config = loftq_config,
             use_rslora = use_rslora,
+            modules_to_save = modules_to_save,
             **kwargs,
         )
         if not SUPPORTS_LOFTQ: del arguments["loftq_config"]
@@ -1337,6 +1379,14 @@ class FastLlamaModel:
         model = _get_peft_model(model, lora_config)
 
         model = FastLlamaModel.patch_peft_model(model, use_gradient_checkpointing)
+
+        # Now patch lm_head and embed_tokens
+        if train_embed_tokens:
+            model.model.model.embed_tokens.requires_grad_(True)
+        if train_lm_head:
+            model.model.lm_head.requires_grad_(True)
+        pass
+
         return model
     pass
 
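Note: after `get_peft_model` returns, the modules requested via `modules_to_save` should actually end up trainable. A quick hedged check on the resulting `model` (works for any PEFT-wrapped model):

    # Sketch: list trainable weights that are not LoRA adapters (e.g. embed_tokens / lm_head copies).
    for name, param in model.named_parameters():
        if param.requires_grad and "lora_" not in name:
            print(name, tuple(param.shape))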
@@ -1427,9 +1477,12 @@ class FastLlamaModel:
             if hasattr(gate_proj, "lora_A") and \
                hasattr( up_proj, "lora_A") and \
                hasattr(down_proj, "lora_A") and \
-               (gate_proj.base_layer if hasattr(gate_proj, "base_layer") else gate_proj).bias is None and \
-               ( up_proj.base_layer if hasattr( up_proj, "base_layer") else up_proj).bias is None and \
-               (down_proj.base_layer if hasattr(down_proj, "base_layer") else down_proj).bias is None:
+               ((gate_proj.base_layer if hasattr(gate_proj, "base_layer") else gate_proj).bias is None) and \
+               (( up_proj.base_layer if hasattr( up_proj, "base_layer") else up_proj).bias is None) and \
+               ((down_proj.base_layer if hasattr(down_proj, "base_layer") else down_proj).bias is None) and \
+               ((gate_proj.lora_magnitude_vector if hasattr(gate_proj, "lora_magnitude_vector") else None) is None) and \
+               (( up_proj.lora_magnitude_vector if hasattr( up_proj, "lora_magnitude_vector") else None) is None) and \
+               ((down_proj.lora_magnitude_vector if hasattr(down_proj, "lora_magnitude_vector") else None) is None):
 
                 # https://stackoverflow.com/questions/50599045/python-replacing-a-function-within-a-class-of-a-module
                 layer.mlp.forward = types.MethodType(apply_lora_mlp, layer.mlp)
@@ -1448,9 +1501,12 @@ class FastLlamaModel:
             if hasattr(q_proj, "lora_A") and \
               hasattr(k_proj, "lora_A") and \
               hasattr(v_proj, "lora_A") and \
-              (q_proj.base_layer if hasattr(q_proj, "base_layer") else q_proj).bias is None and \
-              (k_proj.base_layer if hasattr(k_proj, "base_layer") else k_proj).bias is None and \
-              (v_proj.base_layer if hasattr(v_proj, "base_layer") else v_proj).bias is None:
+              ((q_proj.base_layer if hasattr(q_proj, "base_layer") else q_proj).bias is None) and \
+              ((k_proj.base_layer if hasattr(k_proj, "base_layer") else k_proj).bias is None) and \
+              ((v_proj.base_layer if hasattr(v_proj, "base_layer") else v_proj).bias is None) and \
+              ((q_proj.lora_magnitude_vector if hasattr(q_proj, "lora_magnitude_vector") else None) is None) and \
+              ((k_proj.lora_magnitude_vector if hasattr(k_proj, "lora_magnitude_vector") else None) is None) and \
+              ((v_proj.lora_magnitude_vector if hasattr(v_proj, "lora_magnitude_vector") else None) is None):
 
                 layer.self_attn.apply_qkv = apply_lora_qkv
                 n_qkv += 1
@@ -1464,7 +1520,8 @@ class FastLlamaModel:
             # O attention patching
             o_proj = layer.self_attn.o_proj
             if hasattr(o_proj, "lora_A") and \
-               (o_proj.base_layer if hasattr(o_proj, "base_layer") else o_proj).bias is None:
+               ((o_proj.base_layer if hasattr(o_proj, "base_layer") else o_proj).bias is None) and \
+               ((o_proj.lora_magnitude_vector if hasattr(o_proj, "lora_magnitude_vector") else None) is None):
 
                 layer.self_attn.apply_o = apply_lora_o
                 n_o += 1
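Note: the MLP, QKV and O patches above now share one guard: take the fused fast-LoRA path only when the projection has plain LoRA adapters, no bias on the base layer, and no DoRA magnitude vector (layers with a magnitude vector keep their default PEFT forward). A condensed sketch of that predicate, with an illustrative helper name:

    # Sketch: True only for plain LoRA projections the fast kernels can handle.
    def can_use_fast_lora(proj):
        if not hasattr(proj, "lora_A"):
            return False
        base = proj.base_layer if hasattr(proj, "base_layer") else proj
        if base.bias is not None:
            return False
        if getattr(proj, "lora_magnitude_vector", None) is not None:
            return False
        return True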
unsloth/save.py
@@ -203,7 +203,11 @@ def unsloth_save_model(
 
         print("Unsloth: Merging 4bit and LoRA weights to 4bit...")
         print("This might take 5 minutes...")
-        model = model.merge_and_unload()
+
+        # Counteract no LoRA adapters!
+        if hasattr(model, "merge_and_unload"):
+            model = model.merge_and_unload()
+        pass
         print("Done.")
     pass
 
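Note: per the "Counteract no LoRA adapters!" comment, `unsloth_save_model` can be handed a model that never had adapters attached; such a model has no `merge_and_unload` method, so the old unconditional call would raise an AttributeError. The pattern in isolation:

    # Sketch: only PEFT-wrapped models expose merge_and_unload(); plain models pass through unchanged.
    if hasattr(model, "merge_and_unload"):
        model = model.merge_and_unload()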
@@ -573,6 +577,21 @@ def install_llama_cpp_old(version = -10):
     latest = releases[-1]
     version = releases[version].split(" ")[0]
 
+    # Check if the llama.cpp exists
+    if os.path.exists("llama.cpp"):
+        print(
+            "**[WARNING]** You have a llama.cpp old directory which is broken.\n"\
+            "Unsloth will DELETE the broken directory and install a new one.\n"\
+            "Press CTRL + C / cancel this if this is wrong. We shall wait 10 seconds.\n"
+        )
+        import time
+        for i in range(10):
+            print(f"**[WARNING]** Deleting llama.cpp directory... {10-i} seconds left.")
+            time.sleep(1)
+        import shutil
+        shutil.rmtree("llama.cpp")
+    pass
+
     # Clone a specific commit
     commands = [
         "git clone https://github.com/ggerganov/llama.cpp",