Revert quantization methods

2026-04-21 13:37:39 +00:00 · 2024-01-19 23:15:20 +11:00 · 2024-01-19 23:15:20 +11:00 · 0a52390ac2
commit 0a52390ac2
parent b3fcea6421
1 changed file with 26 additions and 26 deletions
--- a/unsloth/save.py
+++ b/unsloth/save.py
@ -106,7 +106,7 @@ def unsloth_save_model(
    model,
    tokenizer,
    save_directory       : Union[str, os.PathLike],
-    merge_method         : str = "lora", # ["lora", "16bit", "4bit"]
+    save_method          : str = "lora", # ["lora", "merged_16bit", "merged_4bit"]
    push_to_hub          : bool = False,
    token                : Optional[Union[str, bool]] = None,
    is_main_process      : bool = True,
@ -131,7 +131,7 @@ def unsloth_save_model(
    maximum_memory_usage : float = 0.9,
 ):
    save_pretrained_settings = dict(locals())
-    for deletion in ("model", "tokenizer", "merge_method", "temporary_location", "maximum_memory_usage"):
+    for deletion in ("model", "tokenizer", "save_method", "temporary_location", "maximum_memory_usage"):
        del save_pretrained_settings[deletion]
    pass
    import re
@ -144,8 +144,8 @@ def unsloth_save_model(
        gc.collect()
    pass

-    merge_method = merge_method.lower().replace(" ", "_")
-    if merge_method != "lora" and merge_method != "16bit" and merge_method != "4bit":
+    save_method = save_method.lower().replace(" ", "_")
+    if save_method != "lora" and save_method != "merged_16bit" and save_method != "merged_4bit":
        raise RuntimeError(
            "Unsloth: You must select one of 3 options when saving models:\n"\
            '"lora"         ==> This is the fastest and easiet. Just saves LoRA modules.\n'\
@ -154,7 +154,7 @@ def unsloth_save_model(
        )
    pass

-    if merge_method == "4bit":
+    if save_method == "merged_4bit":
        print("Unsloth: Merging 4bit and LoRA weights to 4bit...")
        print("This might take 5 minutes...")
        model = model.merge_and_unload()
@ -169,7 +169,7 @@ def unsloth_save_model(
    pass
    save_pretrained_settings["tags"] = tags

-    if (merge_method == "lora") and push_to_hub:
+    if (save_method == "lora") and push_to_hub:
        if token is None:
            raise RuntimeError(
                "Unsloth: Pushing to HF requires a token. Pass `token = 'hf_....'`\n"\
@ -222,7 +222,7 @@ def unsloth_save_model(
        save_directory = new_save_directory
    pass
    
-    if (merge_method == "4bit") or (merge_method == "lora") or (
+    if (save_method == "merged_4bit") or (save_method == "lora") or (
        not hasattr(model, "model") or \
        not hasattr(model.model, "model") or \
        not hasattr(model.model.model, "layers")
@ -246,7 +246,7 @@ def unsloth_save_model(
            print()

        print("Unsloth: Saving model...", end = "")
-        if merge_method != "lora": print(" This might take 10 minutes for Llama-7b...", end = "")
+        if save_method != "lora": print(" This might take 10 minutes for Llama-7b...", end = "")

        model.save_pretrained(**save_pretrained_settings)
        print(" Done.")
@ -435,17 +435,17 @@ pass

 def save_to_gguf(
    model_directory : str = "unsloth_finetuned_model",
-    quantization    : str = "fast_quantized",
+    quantization_method    : str = "fast_quantized",
    _run_installer = None, # Non blocking install of llama.cpp
 ):
    from transformers.models.llama.modeling_llama import logger

-    if   quantization == "not_quantized":  quantization = "f16"
-    elif quantization == "fast_quantized": quantization = "q8_0"
-    elif quantization == "quantized":      quantization = "q4_k_m"
-    elif quantization is None:             quantization = "q8_0"
+    if   quantization_method == "not_quantized":  quantization_method = "f16"
+    elif quantization_method == "fast_quantized": quantization_method = "q8_0"
+    elif quantization_method == "quantized":      quantization_method = "q4_k_m"
+    elif quantization_method is None:             quantization_method = "q8_0"

-    if quantization not in ALLOWED_QUANTS.keys():
+    if quantization_method not in ALLOWED_QUANTS.keys():
        error = f"Unsloth: Quant method = [{quantization}] not supported. Choose from below:\n"
        for key, value in ALLOWED_QUANTS.items():
            error += f"[{key}] => {value}\n"
@ -469,9 +469,9 @@ def save_to_gguf(

    print("Unsloth: [1] Converting HF into GGUF format. This will take 3 minutes...")
    first_conversion = "f16"
-    if   quantization == "f32":  first_conversion = "f32"
-    elif quantization == "f16":  first_conversion = "f16"
-    elif quantization == "q8_0": first_conversion = "q8_0"
+    if   quantization_method == "f32":  first_conversion = "f32"
+    elif quantization_method == "f16":  first_conversion = "f16"
+    elif quantization_method == "q8_0": first_conversion = "q8_0"

    n_cpus = psutil.cpu_count()*2
    # Concurrency from https://rentry.org/llama-cpp-conversions#merging-loras-into-a-model
@ -489,7 +489,7 @@ def save_to_gguf(

    print(f"Unsloth: Conversion completed! Output location: {final_location}")

-    if quantization != first_conversion:
+    if quantization_method != first_conversion:
        old_location = final_location
        print(f"Unsloth: [2] Converting GGUF 16bit into {quantization}. This will take 20 minutes...")
        final_location = f"./{model_directory}-unsloth.{quantization.upper()}.gguf"
@ -512,7 +512,7 @@ def unsloth_save_pretrained_merged(
    self,
    save_directory       : Union[str, os.PathLike],
    tokenizer            = None,
-    merge_method         : str = "16bit", # ["lora", "16bit", "4bit"]
+    save_method         : str = "merged_16bit", # ["lora", "merged_16bit", "merged_4bit"]
    push_to_hub          : bool = False,
    token                : Optional[Union[str, bool]] = None,
    is_main_process      : bool = True,
@ -530,7 +530,7 @@ def unsloth_save_pretrained_merged(
        Same as .save_pretrained(...) except 4bit weights are auto
        converted to float16 with as few overhead as possible.

-        Choose for `merge_method` to be either:
+        Choose for `save_method` to be either:
        1. `16bit`: Merge LoRA into float16 weights. Useful for GGUF / llama.cpp.
        2.  `4bit`: Merge LoRA into int4 weights. Useful for DPO / HF inference.
        3.  `lora`: Save LoRA adapters with no merging. Useful for HF inference.
@ -555,7 +555,7 @@ def unsloth_push_to_hub_merged(
    self,
    repo_id              : str,
    tokenizer            = None,
-    merge_method         : str = "16bit", # ["lora", "16bit", "4bit"]
+    save_method         : str = "merged_16bit", # ["lora", "merged_16bit", "merged_4bit"]
    use_temp_dir         : Optional[bool] = None,
    commit_message       : Optional[str] = None,
    private              : Optional[bool] = None,
@ -573,7 +573,7 @@ def unsloth_push_to_hub_merged(
        Same as .push_to_hub(...) except 4bit weights are auto
        converted to float16 with as few overhead as possible.

-        Choose for `merge_method` to be either:
+        Choose for `save_method` to be either:
        1. `16bit`: Merge LoRA into float16 weights. Useful for GGUF / llama.cpp.
        2.  `4bit`: Merge LoRA into int4 weights. Useful for DPO / HF inference.
        3.  `lora`: Save LoRA adapters with no merging. Useful for HF inference.
@ -601,7 +601,7 @@ def unsloth_save_pretrained_gguf(
    self,
    save_directory       : Union[str, os.PathLike],
    tokenizer            = None,
-    quantization         : str = "fast_quantized",
+    quantization_method         : str = "fast_quantized",
    push_to_hub          : bool = False,
    token                : Optional[Union[str, bool]] = None,
    is_main_process      : bool = True,
@ -647,7 +647,7 @@ def unsloth_save_pretrained_gguf(
    arguments["model"]        = self
    arguments["tokenizer"]    = tokenizer
    arguments["push_to_hub"]  = False # We save ourselves
-    arguments["merge_method"] = "16bit" # Must be 16bit
+    arguments["save_method"] = "merged_16bit" # Must be 16bit
    del arguments["self"]
    del arguments["quantization"]

@ -699,7 +699,7 @@ def unsloth_push_to_hub_gguf(
    self,
    repo_id              : str,
    tokenizer            = None,
-    quantization         : str = "fast_quantized",
+    quantization_method         : str = "fast_quantized",
    use_temp_dir         : Optional[bool] = None,
    commit_message       : Optional[str] = None,
    private              : Optional[bool] = None,
@ -746,7 +746,7 @@ def unsloth_push_to_hub_gguf(
    arguments["tokenizer"]      = tokenizer
    arguments["save_directory"] = repo_id
    arguments["push_to_hub"]    = False # We save ourselves
-    arguments["merge_method"]   = "16bit" # Must be 16bit
+    arguments["save_method"]   = "merged_16bit" # Must be 16bit
    del arguments["self"]
    del arguments["repo_id"]
    del arguments["quantization"]