Revert quantization methods

This commit is contained in:
Daniel Han-Chen 2024-01-19 23:15:20 +11:00
parent b3fcea6421
commit 0a52390ac2

View file

@ -106,7 +106,7 @@ def unsloth_save_model(
model,
tokenizer,
save_directory : Union[str, os.PathLike],
merge_method : str = "lora", # ["lora", "16bit", "4bit"]
save_method : str = "lora", # ["lora", "merged_16bit", "merged_4bit"]
push_to_hub : bool = False,
token : Optional[Union[str, bool]] = None,
is_main_process : bool = True,
@ -131,7 +131,7 @@ def unsloth_save_model(
maximum_memory_usage : float = 0.9,
):
save_pretrained_settings = dict(locals())
for deletion in ("model", "tokenizer", "merge_method", "temporary_location", "maximum_memory_usage"):
for deletion in ("model", "tokenizer", "save_method", "temporary_location", "maximum_memory_usage"):
del save_pretrained_settings[deletion]
pass
import re
@ -144,8 +144,8 @@ def unsloth_save_model(
gc.collect()
pass
merge_method = merge_method.lower().replace(" ", "_")
if merge_method != "lora" and merge_method != "16bit" and merge_method != "4bit":
save_method = save_method.lower().replace(" ", "_")
if save_method != "lora" and save_method != "merged_16bit" and save_method != "merged_4bit":
raise RuntimeError(
"Unsloth: You must select one of 3 options when saving models:\n"\
'"lora" ==> This is the fastest and easiet. Just saves LoRA modules.\n'\
@ -154,7 +154,7 @@ def unsloth_save_model(
)
pass
if merge_method == "4bit":
if save_method == "merged_4bit":
print("Unsloth: Merging 4bit and LoRA weights to 4bit...")
print("This might take 5 minutes...")
model = model.merge_and_unload()
@ -169,7 +169,7 @@ def unsloth_save_model(
pass
save_pretrained_settings["tags"] = tags
if (merge_method == "lora") and push_to_hub:
if (save_method == "lora") and push_to_hub:
if token is None:
raise RuntimeError(
"Unsloth: Pushing to HF requires a token. Pass `token = 'hf_....'`\n"\
@ -222,7 +222,7 @@ def unsloth_save_model(
save_directory = new_save_directory
pass
if (merge_method == "4bit") or (merge_method == "lora") or (
if (save_method == "merged_4bit") or (save_method == "lora") or (
not hasattr(model, "model") or \
not hasattr(model.model, "model") or \
not hasattr(model.model.model, "layers")
@ -246,7 +246,7 @@ def unsloth_save_model(
print()
print("Unsloth: Saving model...", end = "")
if merge_method != "lora": print(" This might take 10 minutes for Llama-7b...", end = "")
if save_method != "lora": print(" This might take 10 minutes for Llama-7b...", end = "")
model.save_pretrained(**save_pretrained_settings)
print(" Done.")
@ -435,17 +435,17 @@ pass
def save_to_gguf(
model_directory : str = "unsloth_finetuned_model",
quantization : str = "fast_quantized",
quantization_method : str = "fast_quantized",
_run_installer = None, # Non blocking install of llama.cpp
):
from transformers.models.llama.modeling_llama import logger
if quantization == "not_quantized": quantization = "f16"
elif quantization == "fast_quantized": quantization = "q8_0"
elif quantization == "quantized": quantization = "q4_k_m"
elif quantization is None: quantization = "q8_0"
if quantization_method == "not_quantized": quantization_method = "f16"
elif quantization_method == "fast_quantized": quantization_method = "q8_0"
elif quantization_method == "quantized": quantization_method = "q4_k_m"
elif quantization_method is None: quantization_method = "q8_0"
if quantization not in ALLOWED_QUANTS.keys():
if quantization_method not in ALLOWED_QUANTS.keys():
error = f"Unsloth: Quant method = [{quantization}] not supported. Choose from below:\n"
for key, value in ALLOWED_QUANTS.items():
error += f"[{key}] => {value}\n"
@ -469,9 +469,9 @@ def save_to_gguf(
print("Unsloth: [1] Converting HF into GGUF format. This will take 3 minutes...")
first_conversion = "f16"
if quantization == "f32": first_conversion = "f32"
elif quantization == "f16": first_conversion = "f16"
elif quantization == "q8_0": first_conversion = "q8_0"
if quantization_method == "f32": first_conversion = "f32"
elif quantization_method == "f16": first_conversion = "f16"
elif quantization_method == "q8_0": first_conversion = "q8_0"
n_cpus = psutil.cpu_count()*2
# Concurrency from https://rentry.org/llama-cpp-conversions#merging-loras-into-a-model
@ -489,7 +489,7 @@ def save_to_gguf(
print(f"Unsloth: Conversion completed! Output location: {final_location}")
if quantization != first_conversion:
if quantization_method != first_conversion:
old_location = final_location
print(f"Unsloth: [2] Converting GGUF 16bit into {quantization}. This will take 20 minutes...")
final_location = f"./{model_directory}-unsloth.{quantization.upper()}.gguf"
@ -512,7 +512,7 @@ def unsloth_save_pretrained_merged(
self,
save_directory : Union[str, os.PathLike],
tokenizer = None,
merge_method : str = "16bit", # ["lora", "16bit", "4bit"]
save_method : str = "merged_16bit", # ["lora", "merged_16bit", "merged_4bit"]
push_to_hub : bool = False,
token : Optional[Union[str, bool]] = None,
is_main_process : bool = True,
@ -530,7 +530,7 @@ def unsloth_save_pretrained_merged(
Same as .save_pretrained(...) except 4bit weights are auto
converted to float16 with as few overhead as possible.
Choose for `merge_method` to be either:
Choose for `save_method` to be either:
1. `16bit`: Merge LoRA into float16 weights. Useful for GGUF / llama.cpp.
2. `4bit`: Merge LoRA into int4 weights. Useful for DPO / HF inference.
3. `lora`: Save LoRA adapters with no merging. Useful for HF inference.
@ -555,7 +555,7 @@ def unsloth_push_to_hub_merged(
self,
repo_id : str,
tokenizer = None,
merge_method : str = "16bit", # ["lora", "16bit", "4bit"]
save_method : str = "merged_16bit", # ["lora", "merged_16bit", "merged_4bit"]
use_temp_dir : Optional[bool] = None,
commit_message : Optional[str] = None,
private : Optional[bool] = None,
@ -573,7 +573,7 @@ def unsloth_push_to_hub_merged(
Same as .push_to_hub(...) except 4bit weights are auto
converted to float16 with as few overhead as possible.
Choose for `merge_method` to be either:
Choose for `save_method` to be either:
1. `16bit`: Merge LoRA into float16 weights. Useful for GGUF / llama.cpp.
2. `4bit`: Merge LoRA into int4 weights. Useful for DPO / HF inference.
3. `lora`: Save LoRA adapters with no merging. Useful for HF inference.
@ -601,7 +601,7 @@ def unsloth_save_pretrained_gguf(
self,
save_directory : Union[str, os.PathLike],
tokenizer = None,
quantization : str = "fast_quantized",
quantization_method : str = "fast_quantized",
push_to_hub : bool = False,
token : Optional[Union[str, bool]] = None,
is_main_process : bool = True,
@ -647,7 +647,7 @@ def unsloth_save_pretrained_gguf(
arguments["model"] = self
arguments["tokenizer"] = tokenizer
arguments["push_to_hub"] = False # We save ourselves
arguments["merge_method"] = "16bit" # Must be 16bit
arguments["save_method"] = "merged_16bit" # Must be 16bit
del arguments["self"]
del arguments["quantization"]
@ -699,7 +699,7 @@ def unsloth_push_to_hub_gguf(
self,
repo_id : str,
tokenizer = None,
quantization : str = "fast_quantized",
quantization_method : str = "fast_quantized",
use_temp_dir : Optional[bool] = None,
commit_message : Optional[str] = None,
private : Optional[bool] = None,
@ -746,7 +746,7 @@ def unsloth_push_to_hub_gguf(
arguments["tokenizer"] = tokenizer
arguments["save_directory"] = repo_id
arguments["push_to_hub"] = False # We save ourselves
arguments["merge_method"] = "16bit" # Must be 16bit
arguments["save_method"] = "merged_16bit" # Must be 16bit
del arguments["self"]
del arguments["repo_id"]
del arguments["quantization"]