Quick fixes (#101)

* Fix tokenizer, dropout, bias for LoRA

* Update loader.py

* Fix LoRA downcasting

* Update _utils.py

* Saving to GGUF

* fix

* colab_quantize_to_gguf

* move save modules

* save module

* Update __init__.py

* Update save.py

* Temp downgrade due to TRL issue

* Fix up bugs

* Faster saving + other changes

* Update llama.py

* Saving modules

* spelling

* Update llama.py

* Update save.py

* Update save.py

* Update loader.py

* Update llama.py

* patch saving

* Update save.py

* Update save.py

* Update save.py

* patch saving

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* original_model

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* saving to RAM leakage?

* Update save.py

* new_save_directory

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update pyproject.toml

* Update pyproject.toml

* Update pyproject.toml

* Quick fixes

* Update llama.py

* Update llama.py

* Update dpo.py

* Update dpo.py

* Update llama.py

* Update save.py
This commit is contained in:
Daniel Han 2024-01-19 22:52:30 +11:00 committed by GitHub
parent d691516ab9
commit b3fcea6421
5 changed files with 167 additions and 108 deletions

View file

@ -65,8 +65,46 @@ def NotebookProgressCallback_on_log(self, args, state, control, logs=None, **kwa
pass
def NotebookTrainingTracker_write_line(self, values):
"""
Write the values in the inner table.
Args:
values (`Dict[str, float]`): The values to display.
"""
if self.inner_table is None:
self.inner_table = [list(values.keys()), list(values.values())]
else:
columns = self.inner_table[0]
print(columns)
for key in values.keys():
if key not in columns:
columns.append(key)
self.inner_table[0] = columns
if len(self.inner_table) > 1:
last_values = self.inner_table[-1]
first_column = self.inner_table[0][0]
if last_values[0] != values[first_column]:
# write new line
self.inner_table.append([values[c] if c in values else "No Log" for c in columns])
else:
# update last line
new_values = values
for c in columns:
if c not in new_values.keys():
new_values[c] = last_values[columns.index(c)]
self.inner_table[-1] = [new_values[c] for c in columns]
else:
# Edit for evaluation purposes
self.inner_table.append([values[c] if c in values else 0 for c in columns])
pass
pass
pass
def PatchDPOTrainer():
# Patch DPO notebook printing
# NotebookTrainingTracker.write_line = NotebookTrainingTracker_write_line
from transformers.trainer import DEFAULT_PROGRESS_CALLBACK
DEFAULT_PROGRESS_CALLBACK.on_train_begin = NotebookProgressCallback_on_train_begin
DEFAULT_PROGRESS_CALLBACK.on_log = NotebookProgressCallback_on_log

View file

@ -161,11 +161,12 @@ pass
def fast_rms_layernorm_inference(self, X):
old_dtype = X.dtype
X = X.to(torch.float32)
variance = X.square().mean(-1, keepdim = True)
variance += self.variance_epsilon
X *= variance.rsqrt_()
X = X.to(residual.dtype)
X = X.to(old_dtype)
X *= self.weight
return X
pass
@ -660,14 +661,15 @@ class FastLlamaModel:
@staticmethod
def from_pretrained(
model_name = "unsloth/llama-2-7b-bnb-4bit",
model_name = "unsloth/llama-2-7b-bnb-4bit",
max_seq_length = 4096,
dtype = None,
load_in_4bit = True,
token = None,
device_map = "sequential",
rope_scaling = None,
fix_tokenizer = True,
dtype = None,
load_in_4bit = True,
token = None,
device_map = "sequential",
rope_scaling = None,
fix_tokenizer = True,
**kwargs,
):
SUPPORTS_BFLOAT16 = torch.cuda.is_bf16_supported()
gpu_stats = torch.cuda.get_device_properties(0)
@ -720,18 +722,19 @@ class FastLlamaModel:
max_position_embeddings = max(max_seq_length, model_max_seq_length)
model = AutoModelForCausalLM.from_pretrained(
model_name,
device_map = device_map,
torch_dtype = dtype,
quantization_config = bnb_config,
token = token,
rope_scaling = rope_scaling,
device_map = device_map,
torch_dtype = dtype,
quantization_config = bnb_config,
token = token,
rope_scaling = rope_scaling,
max_position_embeddings = max_position_embeddings,
**kwargs,
)
tokenizer = AutoTokenizer.from_pretrained(
model_name,
model_max_length = max_seq_length,
padding_side = "right",
token = token,
padding_side = "right",
token = token,
)
model, tokenizer = patch_tokenizer(model, tokenizer)
@ -755,12 +758,12 @@ class FastLlamaModel:
# We check the tokenizer first for errors
if fix_tokenizer:
tokenizer = check_tokenizer(
model = model,
tokenizer = tokenizer,
model_name = model_name,
model = model,
tokenizer = tokenizer,
model_name = model_name,
model_max_length = max_seq_length,
padding_side = "right",
token = token,
padding_side = "right",
token = token,
)
pass
patch_saving_functions(tokenizer)
@ -828,20 +831,20 @@ class FastLlamaModel:
@staticmethod
def get_peft_model(
model,
r = 16,
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 16,
lora_dropout = 0,
bias = "none",
r = 16,
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 16,
lora_dropout = 0,
bias = "none",
layers_to_transform = None,
layers_pattern = None,
layers_pattern = None,
use_gradient_checkpointing = True,
random_state = 3407,
max_seq_length = 2048, # not used anymore
use_rslora = False,
init_lora_weights = True,
loftq_config = None,
random_state = 3407,
max_seq_length = 2048, # not used anymore
use_rslora = False,
init_lora_weights = True,
loftq_config = None,
**kwargs,
):
if isinstance(model, PeftModelForCausalLM):
@ -909,12 +912,14 @@ class FastLlamaModel:
assert(type(use_rslora) is bool)
if use_rslora:
if not SUPPORTS_RSLORA:
# We do it ourselves!
new_alpha = lora_alpha / (r**0.5)
import peft
raise RuntimeError(
f"Unsloth: Your PEFT version of {peft.__version__} does not support use_rslora.\n"\
"Please install PEFT 0.7.2 or higher.\n"\
"You can also install from source: `pip install git+https://github.com/huggingface/peft.git"
logger.warning_once(
f"Unsloth: Your PEFT version of {peft.__version__} (0.7.2 needed) does not support `use_rslora` natively.\n"\
f"But, we do it ourselves by setting `alpha = {new_alpha}.`"
)
lora_alpha = new_alpha
pass
pass

View file

@ -63,14 +63,14 @@ pass
class FastLanguageModel(FastLlamaModel):
@staticmethod
def from_pretrained(
model_name = "unsloth/mistral-7b-bnb-4bit",
model_name = "unsloth/mistral-7b-bnb-4bit",
max_seq_length = 4096,
dtype = None,
load_in_4bit = True,
token = None,
device_map = "sequential",
rope_scaling = None,
fix_tokenizer = True,
dtype = None,
load_in_4bit = True,
token = None,
device_map = "sequential",
rope_scaling = None,
fix_tokenizer = True,
*args, **kwargs,
):
old_model_name = model_name
@ -106,14 +106,14 @@ class FastLanguageModel(FastLlamaModel):
pass
model, tokenizer = dispatch_model.from_pretrained(
model_name = model_name,
model_name = model_name,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
token = token,
device_map = device_map,
rope_scaling = rope_scaling,
fix_tokenizer = fix_tokenizer,
dtype = dtype,
load_in_4bit = load_in_4bit,
token = token,
device_map = device_map,
rope_scaling = rope_scaling,
fix_tokenizer = fix_tokenizer,
*args, **kwargs,
)

View file

@ -256,14 +256,15 @@ class FastMistralModel(FastLlamaModel):
@staticmethod
def from_pretrained(
model_name = "unsloth/mistral-7b-bnb-4bit",
model_name = "unsloth/mistral-7b-bnb-4bit",
max_seq_length = 4096,
dtype = None,
load_in_4bit = True,
token = None,
device_map = "sequential",
rope_scaling = None, # Mistral does not support RoPE scaling
fix_tokenizer = True,
dtype = None,
load_in_4bit = True,
token = None,
device_map = "sequential",
rope_scaling = None, # Mistral does not support RoPE scaling
fix_tokenizer = True,
**kwargs,
):
if rope_scaling is not None:
logger.warning_once("Unsloth: Mistral models do not support RoPE scaling.")
@ -305,6 +306,7 @@ class FastMistralModel(FastLlamaModel):
quantization_config = bnb_config,
token = token,
# rope_scaling = rope_scaling,
**kwargs,
)
tokenizer = AutoTokenizer.from_pretrained(
model_name,

View file

@ -94,7 +94,7 @@ def fast_save_pickle(shard, name):
torch.save(
shard,
name,
pickle_module = pickle,
pickle_module = pickle,
pickle_protocol = pickle.HIGHEST_PROTOCOL,
)
return
@ -106,7 +106,7 @@ def unsloth_save_model(
model,
tokenizer,
save_directory : Union[str, os.PathLike],
save_method : str = "lora", # ["lora", "merged_16bit", "merged_4bit"]
merge_method : str = "lora", # ["lora", "16bit", "4bit"]
push_to_hub : bool = False,
token : Optional[Union[str, bool]] = None,
is_main_process : bool = True,
@ -131,7 +131,7 @@ def unsloth_save_model(
maximum_memory_usage : float = 0.9,
):
save_pretrained_settings = dict(locals())
for deletion in ("model", "tokenizer", "save_method", "temporary_location", "maximum_memory_usage"):
for deletion in ("model", "tokenizer", "merge_method", "temporary_location", "maximum_memory_usage"):
del save_pretrained_settings[deletion]
pass
import re
@ -144,8 +144,8 @@ def unsloth_save_model(
gc.collect()
pass
save_method = save_method.lower().replace(" ", "_")
if save_method != "lora" and save_method != "merged_16bit" and save_method != "merged_4bit":
merge_method = merge_method.lower().replace(" ", "_")
if merge_method != "lora" and merge_method != "16bit" and merge_method != "4bit":
raise RuntimeError(
"Unsloth: You must select one of 3 options when saving models:\n"\
'"lora" ==> This is the fastest and easiest. Just saves LoRA modules.\n'\
@ -154,7 +154,7 @@ def unsloth_save_model(
)
pass
if save_method == "merged_4bit":
if merge_method == "4bit":
print("Unsloth: Merging 4bit and LoRA weights to 4bit...")
print("This might take 5 minutes...")
model = model.merge_and_unload()
@ -169,7 +169,7 @@ def unsloth_save_model(
pass
save_pretrained_settings["tags"] = tags
if (save_method == "lora") and push_to_hub:
if (merge_method == "lora") and push_to_hub:
if token is None:
raise RuntimeError(
"Unsloth: Pushing to HF requires a token. Pass `token = 'hf_....'`\n"\
@ -222,7 +222,7 @@ def unsloth_save_model(
save_directory = new_save_directory
pass
if (save_method == "merged_4bit") or (save_method == "lora") or (
if (merge_method == "4bit") or (merge_method == "lora") or (
not hasattr(model, "model") or \
not hasattr(model.model, "model") or \
not hasattr(model.model.model, "layers")
@ -246,7 +246,7 @@ def unsloth_save_model(
print()
print("Unsloth: Saving model...", end = "")
if save_method != "lora": print(" This might take 10 minutes for Llama-7b...", end = "")
if merge_method != "lora": print(" This might take 10 minutes for Llama-7b...", end = "")
model.save_pretrained(**save_pretrained_settings)
print(" Done.")
@ -434,19 +434,19 @@ pass
def save_to_gguf(
model_directory : str = "unsloth_finetuned_model",
quantization_method : str = "fast_quantized",
model_directory : str = "unsloth_finetuned_model",
quantization : str = "fast_quantized",
_run_installer = None, # Non blocking install of llama.cpp
):
from transformers.models.llama.modeling_llama import logger
if quantization_method == "not_quantized": quantization_method = "f16"
elif quantization_method == "fast_quantized": quantization_method = "q8_0"
elif quantization_method == "quantized": quantization_method = "q4_k_m"
elif quantization_method is None: quantization_method = "q8_0"
if quantization == "not_quantized": quantization = "f16"
elif quantization == "fast_quantized": quantization = "q8_0"
elif quantization == "quantized": quantization = "q4_k_m"
elif quantization is None: quantization = "q8_0"
if quantization_method not in ALLOWED_QUANTS.keys():
error = f"Unsloth: Quant method = [{quantization_method}] not supported. Choose from below:\n"
if quantization not in ALLOWED_QUANTS.keys():
error = f"Unsloth: Quant method = [{quantization}] not supported. Choose from below:\n"
for key, value in ALLOWED_QUANTS.items():
error += f"[{key}] => {value}\n"
raise RuntimeError(error)
@ -456,7 +456,7 @@ def save_to_gguf(
f"==((====))== Unsloth: Conversion from QLoRA to GGUF information\n"\
f" \\\ /| [0] Installing llama.cpp will take 3 minutes.\n"\
f"O^O/ \_/ \\ [1] Converting HF to GGUF 16bits will take 3 minutes.\n"\
f"\ / [2] Converting GGUF 16bits to {quantization_method} will take 20 minutes.\n"\
f"\ / [2] Converting GGUF 16bits to {quantization} will take 20 minutes.\n"\
f' "-____-" In total, you will have to wait around 26 minutes.\n'
print(print_info)
@ -469,9 +469,9 @@ def save_to_gguf(
print("Unsloth: [1] Converting HF into GGUF format. This will take 3 minutes...")
first_conversion = "f16"
if quantization_method == "f32": first_conversion = "f32"
elif quantization_method == "f16": first_conversion = "f16"
elif quantization_method == "q8_0": first_conversion = "q8_0"
if quantization == "f32": first_conversion = "f32"
elif quantization == "f16": first_conversion = "f16"
elif quantization == "q8_0": first_conversion = "q8_0"
n_cpus = psutil.cpu_count()*2
# Concurrency from https://rentry.org/llama-cpp-conversions#merging-loras-into-a-model
@ -489,13 +489,13 @@ def save_to_gguf(
print(f"Unsloth: Conversion completed! Output location: {final_location}")
if quantization_method != first_conversion:
if quantization != first_conversion:
old_location = final_location
print(f"Unsloth: [2] Converting GGUF 16bit into {quantization_method}. This will take 20 minutes...")
final_location = f"./{model_directory}-unsloth.{quantization_method.upper()}.gguf"
print(f"Unsloth: [2] Converting GGUF 16bit into {quantization}. This will take 20 minutes...")
final_location = f"./{model_directory}-unsloth.{quantization.upper()}.gguf"
command = f"./llama.cpp/quantize {old_location} "\
f"{final_location} {quantization_method} {n_cpus}"
f"{final_location} {quantization} {n_cpus}"
with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, bufsize = 1) as sp:
for line in sp.stdout:
@ -511,7 +511,8 @@ pass
def unsloth_save_pretrained_merged(
self,
save_directory : Union[str, os.PathLike],
save_method : str = "merged_16bit", # ["lora", "merged_16bit", "merged_4bit"]
tokenizer = None,
merge_method : str = "16bit", # ["lora", "16bit", "4bit"]
push_to_hub : bool = False,
token : Optional[Union[str, bool]] = None,
is_main_process : bool = True,
@ -529,14 +530,20 @@ def unsloth_save_pretrained_merged(
Same as .save_pretrained(...) except 4bit weights are auto
converted to float16 with as little overhead as possible.
Choose for `save_method` to be either:
1. `merged_16bit`: Merge LoRA into float16 weights. Useful for GGUF / llama.cpp.
2. `merged_4bit`: Merge LoRA into int4 weights. Useful for DPO / HF inference.
3. `lora`: Save LoRA adapters with no merging. Useful for HF inference.
Choose for `merge_method` to be either:
1. `16bit`: Merge LoRA into float16 weights. Useful for GGUF / llama.cpp.
2. `4bit`: Merge LoRA into int4 weights. Useful for DPO / HF inference.
3. `lora`: Save LoRA adapters with no merging. Useful for HF inference.
"""
if tokenizer is None:
logger.warning_once(
"Unsloth: You're not saving a tokenizer as well?\n"\
"You can do it separately via `tokenizer.save_pretrained(...)`"
)
pass
arguments = dict(locals())
arguments["model"] = self
arguments["tokenizer"] = None
arguments["model"] = self
del arguments["self"]
unsloth_save_model(**arguments)
for _ in range(3):
@ -547,7 +554,8 @@ pass
def unsloth_push_to_hub_merged(
self,
repo_id : str,
save_method : str = "merged_16bit", # ["lora", "merged_16bit", "merged_4bit"]
tokenizer = None,
merge_method : str = "16bit", # ["lora", "16bit", "4bit"]
use_temp_dir : Optional[bool] = None,
commit_message : Optional[str] = None,
private : Optional[bool] = None,
@ -565,14 +573,20 @@ def unsloth_push_to_hub_merged(
Same as .push_to_hub(...) except 4bit weights are auto
converted to float16 with as little overhead as possible.
Choose for `save_method` to be either:
1. `merged_16bit`: Merge LoRA into float16 weights. Useful for GGUF / llama.cpp.
2. `merged_4bit`: Merge LoRA into int4 weights. Useful for DPO / HF inference.
3. `lora`: Save LoRA adapters with no merging. Useful for HF inference.
Choose for `merge_method` to be either:
1. `16bit`: Merge LoRA into float16 weights. Useful for GGUF / llama.cpp.
2. `4bit`: Merge LoRA into int4 weights. Useful for DPO / HF inference.
3. `lora`: Save LoRA adapters with no merging. Useful for HF inference.
"""
if tokenizer is None:
logger.warning_once(
"Unsloth: You're not saving a tokenizer as well?\n"\
"You can do it separately via `tokenizer.push_to_hub(...)`"
)
pass
arguments = dict(locals())
arguments["model"] = self
arguments["tokenizer"] = None
arguments["save_directory"] = repo_id
arguments["push_to_hub"] = True
del arguments["self"]
@ -587,7 +601,7 @@ def unsloth_save_pretrained_gguf(
self,
save_directory : Union[str, os.PathLike],
tokenizer = None,
quantization_method : str = "fast_quantized",
quantization : str = "fast_quantized",
push_to_hub : bool = False,
token : Optional[Union[str, bool]] = None,
is_main_process : bool = True,
@ -605,7 +619,7 @@ def unsloth_save_pretrained_gguf(
Same as .save_pretrained(...) except 4bit weights are auto
converted to float16 then converted to GGUF / llama.cpp format.
Choose for `quantization_method` to be:
Choose for `quantization` to be:
"not_quantized" : "Recommended. Fast conversion. Slow inference, big files.",
"fast_quantized" : "Recommended. Fast conversion. OK inference, OK file size.",
"quantized" : "Recommended. Slow conversion. Fast inference, small files.",
@ -630,12 +644,12 @@ def unsloth_save_pretrained_gguf(
raise ValueError("Unsloth: Saving to GGUF must have a tokenizer.")
arguments = dict(locals())
arguments["model"] = self
arguments["tokenizer"] = tokenizer
arguments["push_to_hub"] = False # We save ourselves
arguments["save_method"] = "merged_16bit" # Must be 16bit
arguments["model"] = self
arguments["tokenizer"] = tokenizer
arguments["push_to_hub"] = False # We save ourselves
arguments["merge_method"] = "16bit" # Must be 16bit
del arguments["self"]
del arguments["quantization_method"]
del arguments["quantization"]
# Non blocking install GGUF first
git_clone = install_llama_cpp_clone_non_blocking()
@ -648,7 +662,7 @@ def unsloth_save_pretrained_gguf(
for _ in range(3):
gc.collect()
file_location = save_to_gguf(new_save_directory, quantization_method, makefile)
file_location = save_to_gguf(new_save_directory, quantization, makefile)
# And save to HF
if push_to_hub:
@ -685,7 +699,7 @@ def unsloth_push_to_hub_gguf(
self,
repo_id : str,
tokenizer = None,
quantization_method : str = "fast_quantized",
quantization : str = "fast_quantized",
use_temp_dir : Optional[bool] = None,
commit_message : Optional[str] = None,
private : Optional[bool] = None,
@ -703,7 +717,7 @@ def unsloth_push_to_hub_gguf(
Same as .push_to_hub(...) except 4bit weights are auto
converted to float16 then converted to GGUF / llama.cpp format.
Choose for `quantization_method` to be:
Choose for `quantization` to be:
"not_quantized" : "Recommended. Fast conversion. Slow inference, big files.",
"fast_quantized" : "Recommended. Fast conversion. OK inference, OK file size.",
"quantized" : "Recommended. Slow conversion. Fast inference, small files.",
@ -732,10 +746,10 @@ def unsloth_push_to_hub_gguf(
arguments["tokenizer"] = tokenizer
arguments["save_directory"] = repo_id
arguments["push_to_hub"] = False # We save ourselves
arguments["save_method"] = "merged_16bit" # Must be 16bit
arguments["merge_method"] = "16bit" # Must be 16bit
del arguments["self"]
del arguments["repo_id"]
del arguments["quantization_method"]
del arguments["quantization"]
# Non blocking install GGUF first
git_clone = install_llama_cpp_clone_non_blocking()
@ -748,7 +762,7 @@ def unsloth_push_to_hub_gguf(
gc.collect()
python_install.wait()
file_location = save_to_gguf(new_save_directory, quantization_method, makefile)
file_location = save_to_gguf(new_save_directory, quantization, makefile)
# Save to hub
print("Unsloth: Uploading GGUF to Huggingface Hub...")