More bug fixes (#133)

* faster saving & inference
* fast inference
* Fast inference RoPE
* Mistral correct RoPE scaling
* Max sequence lengths
* Apache 2
* fast_linear_forward
* No print
* LoRA
* Fast LoRA saving
* hidden_states
* q_len == 1 / q_len issue
* incorrect inference
* Update to transformers 4.37
* Graceful FA2 error + torch 2.1.1
* Fix saving and bnb-4bit
* remove patching / Repatch
* many iterative updates to llama.py, mistral.py, save.py, utils.py, fast_lora.py, swiglu.py, mapper.py, and pyproject.toml
Daniel Han, 2024-01-27 04:47:54 +11:00, committed by GitHub
parent 62fae3aa74
commit 7da0c50f75
3 changed files with 21 additions and 10 deletions

unsloth/kernels/utils.py

@@ -179,7 +179,10 @@ pass
 def fast_linear_forward(proj, X, temp_lora = None, out = None):
     W, W_quant, lora_A, lora_B, lora_S = get_lora_parameters(proj)
-    out = fast_gemv(X, W, W_quant, out = out)
+    if W_quant is None:
+        out = torch.matmul(X, W.t())
+    else:
+        out = fast_gemv(X, W, W_quant, out = out)
     if lora_A is not None:
         # Save LoRAs for inference to stop data movement costs
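The branch matters because fast_gemv is unsloth's quantized GEMV kernel and relies on the bitsandbytes quantization state carried in W_quant; when W_quant is None the weight is a plain fp16/bf16 tensor and an ordinary matmul is the correct path. A minimal standalone sketch of the same dispatch (illustrative only, not the unsloth implementation; linear_forward_sketch is a hypothetical name):

import torch

def linear_forward_sketch(W, X, W_quant=None):
    # W_quant stands in for the bitsandbytes quant state; None means the
    # weight is an unquantized tensor, so a regular matmul is correct.
    if W_quant is None:
        return torch.matmul(X, W.t())
    # With a real quant state this is where the quantized GEMV kernel
    # (fast_gemv in the diff) would run; we only mark the branch here.
    raise NotImplementedError("quantized path needs the bnb quant state")

X = torch.randn(1, 1, 8)   # (bsz, q_len=1, in_dim), single-token decoding
W = torch.randn(16, 8)     # (out_dim, in_dim), nn.Linear layout
print(linear_forward_sketch(W, X).shape)   # torch.Size([1, 1, 16])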

unsloth/models/llama.py

@@ -489,7 +489,7 @@ def LlamaModel_fast_forward(
     # Ignore attention_mask
     if attention_mask is None:
         padding_mask = None
-    elif True:#self.training:
+    elif self.training:
         attention_mask = None
         padding_mask = None
     else:
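Before this fix, the hard-coded "elif True:#self.training:" sent every forward pass down the training path, which discards the attention mask and so broke padded-batch inference. A standalone sketch of the corrected branching (select_masks is a hypothetical name for illustration, not the unsloth code):

# Returns (attention_mask, padding_mask); "training" restores the check
# that the old "elif True:" had short-circuited.
def select_masks(attention_mask, training):
    if attention_mask is None:
        return None, None                      # nothing to mask
    elif training:                             # fused training path ignores the mask
        return None, None
    else:                                      # inference keeps the padding mask
        return attention_mask, attention_mask

print(select_masks([1, 1, 0], training=False))  # ([1, 1, 0], [1, 1, 0])
print(select_masks([1, 1, 0], training=True))   # (None, None)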

unsloth/save.py

@@ -258,11 +258,19 @@ def unsloth_save_model(
         "private" : save_pretrained_settings["private"],
         "token"   : save_pretrained_settings["token"],
     }
 
+    # Check if PEFT Model or not - if yes, 3 levels. If not 2 levels.
+    from peft import PeftModelForCausalLM
+    if isinstance(model, PeftModelForCausalLM):
+        internal_model = model.model
+    else:
+        internal_model = model
+    pass
+
     # Cannot be converted properly!
     if (save_method == "merged_4bit") or (save_method == "lora") or (
         not hasattr(model, "model") or \
-        not hasattr(model.model, "model") or \
-        not hasattr(model.model.model, "layers")
+        not hasattr(internal_model.model, "layers")
     ):
         # Do general saving
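The attribute depth differs because PEFT wraps the causal LM: for a PeftModelForCausalLM, model.model is the HF *ForCausalLM and model.model.model is the decoder stack (three levels), while a bare HF model has only two. Normalizing to internal_model makes the rest of the saver work for both. A hedged sketch of the two depths using stand-in classes (all names here are hypothetical):

# Stand-in classes sketching the two wrapping depths, for illustration only.
class Decoder:
    pass                                           # would hold .layers, .norm, .embed_tokens

class HFCausalLM:
    def __init__(self): self.model = Decoder()     # 2 levels: model.model.layers

class PeftCausalLM:
    def __init__(self): self.model = HFCausalLM()  # 3 levels: model.model.model.layers

def unwrap(model):
    # Mirrors the diff: peel one level off a PEFT wrapper so that
    # internal_model.model is always the decoder stack.
    return model.model if isinstance(model, PeftCausalLM) else model

for m in (HFCausalLM(), PeftCausalLM()):
    assert isinstance(unwrap(m).model, Decoder)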
@@ -343,12 +351,12 @@ def unsloth_save_model(
     # HF also uses a OrderedDict
     from collections import OrderedDict
     state_dict = OrderedDict()
-    state_dict["model.embed_tokens.weight"] = model.model.model.embed_tokens.weight.data
+    state_dict["model.embed_tokens.weight"] = internal_model.model.embed_tokens.weight.data
 
     max_vram = int(torch.cuda.get_device_properties(0).total_memory * maximum_memory_usage)
 
     from tqdm import tqdm as ProgressBar
-    for j, layer in enumerate(ProgressBar(model.model.model.layers)):
+    for j, layer in enumerate(ProgressBar(internal_model.model.layers)):
         for item in LLAMA_WEIGHTS:
             proj = eval(f"layer.{item}")
             name = f"model.layers.{j}.{item}.weight"
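The loop resolves each projection of each decoder layer by its dotted name. A standalone sketch of the same pattern, with a getattr chain in place of the diff's eval (WEIGHT_ITEMS and the toy modules below are stand-ins, not unsloth's LLAMA_WEIGHTS or real layers):

import torch
from collections import OrderedDict

WEIGHT_ITEMS = ["attn.q_proj", "mlp.up_proj"]   # stand-in projection names

def resolve(obj, dotted):
    # getattr-chain equivalent of the diff's eval(f"layer.{item}")
    for part in dotted.split("."):
        obj = getattr(obj, part)
    return obj

class Attn(torch.nn.Module):
    def __init__(self): super().__init__(); self.q_proj = torch.nn.Linear(4, 4)
class MLP(torch.nn.Module):
    def __init__(self): super().__init__(); self.up_proj = torch.nn.Linear(4, 8)
class Layer(torch.nn.Module):
    def __init__(self): super().__init__(); self.attn, self.mlp = Attn(), MLP()

state_dict = OrderedDict()
for j, layer in enumerate([Layer(), Layer()]):
    for item in WEIGHT_ITEMS:
        proj = resolve(layer, item)
        state_dict[f"model.layers.{j}.{item}.weight"] = proj.weight.data
print(list(state_dict))   # model.layers.0.attn.q_proj.weight, ...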
@@ -375,8 +383,8 @@ def unsloth_save_model(
             pass
         pass
 
-    state_dict["model.norm.weight"] = model.model.model.norm.weight.data
-    state_dict["lm_head.weight"]    = model.model.lm_head.weight.data
+    state_dict["model.norm.weight"] = internal_model.model.norm.weight.data
+    state_dict["lm_head.weight"]    = internal_model.lm_head.weight.data
 
     # All tensors MUST be type torch.Tensor and not torch.nn.parameter.Parameter
     for key, value in state_dict.items():

@@ -418,7 +426,7 @@ def unsloth_save_model(
     model.config = new_config
 
     # Save!
-    model.model.save_pretrained(**save_pretrained_settings)
+    internal_model.save_pretrained(**save_pretrained_settings)
 
     # Revert config back
     original_model = model
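On the "MUST be torch.Tensor, not Parameter" comment: this is why every read above goes through .weight.data rather than .weight. Accessing a module's weight yields a torch.nn.parameter.Parameter, and .data gives the underlying plain tensor, which is what the serializer expects here. A two-line demonstration:

import torch

layer = torch.nn.Linear(4, 4)
print(type(layer.weight))        # <class 'torch.nn.parameter.Parameter'>
print(type(layer.weight.data))   # <class 'torch.Tensor'>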