Mirror of https://github.com/unslothai/unsloth (synced 2026-04-21 13:37:39 +00:00)
Revert quantization methods

commit 0a52390ac2 (parent b3fcea6421)
1 changed file with 26 additions and 26 deletions
@@ -106,7 +106,7 @@ def unsloth_save_model(
     model,
     tokenizer,
     save_directory : Union[str, os.PathLike],
-    merge_method : str = "lora", # ["lora", "16bit", "4bit"]
+    save_method : str = "lora", # ["lora", "merged_16bit", "merged_4bit"]
     push_to_hub : bool = False,
     token : Optional[Union[str, bool]] = None,
     is_main_process : bool = True,
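As a usage sketch of the reverted keyword (not part of this commit; `model`, `tokenizer`, and the output paths are placeholder assumptions for an already fine-tuned LoRA model), the low-level entry point would be called roughly like this:

# A minimal sketch, assuming a fine-tuned LoRA model and its tokenizer
# are already in memory; the directories are placeholder paths.
unsloth_save_model(
    model,
    tokenizer,
    save_directory = "outputs/finetuned_lora",
    save_method    = "lora",            # fastest: only the LoRA adapters are written
    push_to_hub    = False,
)

# Merging LoRA into full 16-bit weights instead (useful before GGUF conversion):
unsloth_save_model(
    model,
    tokenizer,
    save_directory = "outputs/finetuned_fp16",
    save_method    = "merged_16bit",
    push_to_hub    = False,
)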
@@ -131,7 +131,7 @@ def unsloth_save_model(
     maximum_memory_usage : float = 0.9,
 ):
     save_pretrained_settings = dict(locals())
-    for deletion in ("model", "tokenizer", "merge_method", "temporary_location", "maximum_memory_usage"):
+    for deletion in ("model", "tokenizer", "save_method", "temporary_location", "maximum_memory_usage"):
         del save_pretrained_settings[deletion]
     pass
     import re
@@ -144,8 +144,8 @@ def unsloth_save_model(
         gc.collect()
     pass

-    merge_method = merge_method.lower().replace(" ", "_")
-    if merge_method != "lora" and merge_method != "16bit" and merge_method != "4bit":
+    save_method = save_method.lower().replace(" ", "_")
+    if save_method != "lora" and save_method != "merged_16bit" and save_method != "merged_4bit":
         raise RuntimeError(
             "Unsloth: You must select one of 3 options when saving models:\n"\
             '"lora" ==> This is the fastest and easiet. Just saves LoRA modules.\n'\
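The normalization above lower-cases the argument and turns spaces into underscores before validating it. A small, self-contained illustration of that check (the helper name is ours, not unsloth's):

# Stand-alone restatement of the normalization + validation in the hunk above.
# `normalize_save_method` is a hypothetical helper, not part of unsloth.
def normalize_save_method(save_method: str) -> str:
    save_method = save_method.lower().replace(" ", "_")
    if save_method not in ("lora", "merged_16bit", "merged_4bit"):
        raise RuntimeError(
            "Unsloth: You must select one of 3 options when saving models: "
            "lora, merged_16bit, merged_4bit"
        )
    return save_method

print(normalize_save_method("Merged 16bit"))  # -> "merged_16bit"
print(normalize_save_method("LoRA"))          # -> "lora"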
@@ -154,7 +154,7 @@ def unsloth_save_model(
         )
     pass

-    if merge_method == "4bit":
+    if save_method == "merged_4bit":
         print("Unsloth: Merging 4bit and LoRA weights to 4bit...")
         print("This might take 5 minutes...")
         model = model.merge_and_unload()
@@ -169,7 +169,7 @@ def unsloth_save_model(
     pass
     save_pretrained_settings["tags"] = tags

-    if (merge_method == "lora") and push_to_hub:
+    if (save_method == "lora") and push_to_hub:
         if token is None:
             raise RuntimeError(
                 "Unsloth: Pushing to HF requires a token. Pass `token = 'hf_....'`\n"\
@@ -222,7 +222,7 @@ def unsloth_save_model(
         save_directory = new_save_directory
     pass

-    if (merge_method == "4bit") or (merge_method == "lora") or (
+    if (save_method == "merged_4bit") or (save_method == "lora") or (
         not hasattr(model, "model") or \
         not hasattr(model.model, "model") or \
         not hasattr(model.model.model, "layers")
@@ -246,7 +246,7 @@ def unsloth_save_model(
         print()

     print("Unsloth: Saving model...", end = "")
-    if merge_method != "lora": print(" This might take 10 minutes for Llama-7b...", end = "")
+    if save_method != "lora": print(" This might take 10 minutes for Llama-7b...", end = "")

     model.save_pretrained(**save_pretrained_settings)
     print(" Done.")
@@ -435,17 +435,17 @@ pass

 def save_to_gguf(
     model_directory : str = "unsloth_finetuned_model",
-    quantization : str = "fast_quantized",
+    quantization_method : str = "fast_quantized",
     _run_installer = None, # Non blocking install of llama.cpp
 ):
     from transformers.models.llama.modeling_llama import logger

-    if quantization == "not_quantized": quantization = "f16"
-    elif quantization == "fast_quantized": quantization = "q8_0"
-    elif quantization == "quantized": quantization = "q4_k_m"
-    elif quantization is None: quantization = "q8_0"
+    if quantization_method == "not_quantized": quantization_method = "f16"
+    elif quantization_method == "fast_quantized": quantization_method = "q8_0"
+    elif quantization_method == "quantized": quantization_method = "q4_k_m"
+    elif quantization_method is None: quantization_method = "q8_0"

-    if quantization not in ALLOWED_QUANTS.keys():
+    if quantization_method not in ALLOWED_QUANTS.keys():
         error = f"Unsloth: Quant method = [{quantization}] not supported. Choose from below:\n"
         for key, value in ALLOWED_QUANTS.items():
             error += f"[{key}] => {value}\n"
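The friendly aliases above resolve to concrete llama.cpp quantization names before anything is converted. A self-contained restatement of just that mapping (the helper name is ours, not unsloth's):

# Hypothetical helper mirroring the alias resolution in save_to_gguf above;
# it is only a restatement of the diff's logic for clarity.
from typing import Optional

def resolve_quantization_method(quantization_method: Optional[str]) -> str:
    if quantization_method == "not_quantized":
        return "f16"      # keep the full 16-bit GGUF, no quantization pass
    elif quantization_method == "fast_quantized":
        return "q8_0"     # quick single-step 8-bit quantization
    elif quantization_method == "quantized":
        return "q4_k_m"   # slower, smaller 4-bit K-quant
    elif quantization_method is None:
        return "q8_0"
    return quantization_method  # assume an explicit llama.cpp quant name was passed

print(resolve_quantization_method("fast_quantized"))  # -> "q8_0"
print(resolve_quantization_method(None))              # -> "q8_0"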
@@ -469,9 +469,9 @@ def save_to_gguf(

     print("Unsloth: [1] Converting HF into GGUF format. This will take 3 minutes...")
     first_conversion = "f16"
-    if quantization == "f32": first_conversion = "f32"
-    elif quantization == "f16": first_conversion = "f16"
-    elif quantization == "q8_0": first_conversion = "q8_0"
+    if quantization_method == "f32": first_conversion = "f32"
+    elif quantization_method == "f16": first_conversion = "f16"
+    elif quantization_method == "q8_0": first_conversion = "q8_0"

     n_cpus = psutil.cpu_count()*2
     # Concurrency from https://rentry.org/llama-cpp-conversions#merging-loras-into-a-model
@@ -489,7 +489,7 @@ def save_to_gguf(

     print(f"Unsloth: Conversion completed! Output location: {final_location}")

-    if quantization != first_conversion:
+    if quantization_method != first_conversion:
         old_location = final_location
         print(f"Unsloth: [2] Converting GGUF 16bit into {quantization}. This will take 20 minutes...")
         final_location = f"./{model_directory}-unsloth.{quantization.upper()}.gguf"
@@ -512,7 +512,7 @@ def unsloth_save_pretrained_merged(
     self,
     save_directory : Union[str, os.PathLike],
     tokenizer = None,
-    merge_method : str = "16bit", # ["lora", "16bit", "4bit"]
+    save_method : str = "merged_16bit", # ["lora", "merged_16bit", "merged_4bit"]
     push_to_hub : bool = False,
     token : Optional[Union[str, bool]] = None,
     is_main_process : bool = True,
@@ -530,7 +530,7 @@ def unsloth_save_pretrained_merged(
     Same as .save_pretrained(...) except 4bit weights are auto
     converted to float16 with as few overhead as possible.

-    Choose for `merge_method` to be either:
+    Choose for `save_method` to be either:
     1. `16bit`: Merge LoRA into float16 weights. Useful for GGUF / llama.cpp.
     2. `4bit`: Merge LoRA into int4 weights. Useful for DPO / HF inference.
     3. `lora`: Save LoRA adapters with no merging. Useful for HF inference.
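Assuming this wrapper is exposed on the model as `save_pretrained_merged` (the method name, `model`/`tokenizer` variables, and paths are inferences, not from this commit), choosing between the three options would look roughly like:

# Hedged sketch; the validated values are "lora", "merged_16bit", "merged_4bit".
model.save_pretrained_merged("outputs/adapters", tokenizer,
                             save_method = "lora")          # adapters only, no merge
model.save_pretrained_merged("outputs/fp16_model", tokenizer,
                             save_method = "merged_16bit")  # merge LoRA into float16
model.save_pretrained_merged("outputs/int4_model", tokenizer,
                             save_method = "merged_4bit")   # merge LoRA into int4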
@@ -555,7 +555,7 @@ def unsloth_push_to_hub_merged(
     self,
     repo_id : str,
     tokenizer = None,
-    merge_method : str = "16bit", # ["lora", "16bit", "4bit"]
+    save_method : str = "merged_16bit", # ["lora", "merged_16bit", "merged_4bit"]
     use_temp_dir : Optional[bool] = None,
     commit_message : Optional[str] = None,
     private : Optional[bool] = None,
@@ -573,7 +573,7 @@ def unsloth_push_to_hub_merged(
     Same as .push_to_hub(...) except 4bit weights are auto
     converted to float16 with as few overhead as possible.

-    Choose for `merge_method` to be either:
+    Choose for `save_method` to be either:
     1. `16bit`: Merge LoRA into float16 weights. Useful for GGUF / llama.cpp.
     2. `4bit`: Merge LoRA into int4 weights. Useful for DPO / HF inference.
     3. `lora`: Save LoRA adapters with no merging. Useful for HF inference.
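Similarly for the Hub push path (again a sketch; the method name `push_to_hub_merged` is inferred from the wrapper's name, and the repo id and token are placeholders you must replace):

# Hedged sketch: pushes merged 16-bit weights to the Hugging Face Hub.
model.push_to_hub_merged(
    "your-username/your-finetuned-model",   # placeholder repo id
    tokenizer,
    save_method = "merged_16bit",
    token       = "hf_...",                 # a real HF token is required to push
)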
@@ -601,7 +601,7 @@ def unsloth_save_pretrained_gguf(
     self,
     save_directory : Union[str, os.PathLike],
     tokenizer = None,
-    quantization : str = "fast_quantized",
+    quantization_method : str = "fast_quantized",
     push_to_hub : bool = False,
     token : Optional[Union[str, bool]] = None,
     is_main_process : bool = True,
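The GGUF wrappers take the renamed `quantization_method` keyword instead of `save_method`. A sketch (the method name `save_pretrained_gguf` is inferred from the wrapper's name; the path and values are placeholders):

# Hedged sketch of the GGUF export path with the reverted keyword name.
# "fast_quantized" resolves to q8_0, as shown in save_to_gguf above.
model.save_pretrained_gguf(
    "outputs/gguf_model",
    tokenizer,
    quantization_method = "fast_quantized",   # or "not_quantized", "quantized", "q4_k_m", ...
)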
@@ -647,7 +647,7 @@ def unsloth_save_pretrained_gguf(
     arguments["model"] = self
     arguments["tokenizer"] = tokenizer
     arguments["push_to_hub"] = False # We save ourselves
-    arguments["merge_method"] = "16bit" # Must be 16bit
+    arguments["save_method"] = "merged_16bit" # Must be 16bit
     del arguments["self"]
     del arguments["quantization"]

@@ -699,7 +699,7 @@ def unsloth_push_to_hub_gguf(
     self,
     repo_id : str,
     tokenizer = None,
-    quantization : str = "fast_quantized",
+    quantization_method : str = "fast_quantized",
     use_temp_dir : Optional[bool] = None,
     commit_message : Optional[str] = None,
     private : Optional[bool] = None,
@@ -746,7 +746,7 @@ def unsloth_push_to_hub_gguf(
     arguments["tokenizer"] = tokenizer
     arguments["save_directory"] = repo_id
     arguments["push_to_hub"] = False # We save ourselves
-    arguments["merge_method"] = "16bit" # Must be 16bit
+    arguments["save_method"] = "merged_16bit" # Must be 16bit
     del arguments["self"]
     del arguments["repo_id"]
     del arguments["quantization"]