Quick fixes (#101)

* Fix tokenizer, dropout, bias for LoRA

* Update loader.py

* Fix LoRA downcasting

* Update _utils.py

* Saving to GGUF

* fix

* colab_quantize_to_gguf

* move save modules

* save module

* Update __init__.py

* Update save.py

* Temp downgrade due to TRL issue

* Fix up bugs

* Faster saving + other changes

* Update llama.py

* Saving modules

* spelling

* Update llama.py

* Update save.py

* Update save.py

* Update loader.py

* Update llama.py

* patch saving

* Update save.py

* Update save.py

* Update save.py

* patch saving

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* original_model

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* saving to RAM leakage?

* Update save.py

* new_save_directory

* Update save.py

* Update save.py

* Update save.py

* Update save.py

* Update pyproject.toml

* Update pyproject.toml

* Update pyproject.toml

* Quick fixes

* Update llama.py

* Update llama.py

* Update dpo.py

* Update dpo.py

* Update llama.py

* Update save.py
This commit is contained in:
Daniel Han 2024-01-19 22:52:30 +11:00 committed by GitHub
parent d691516ab9
commit b3fcea6421
5 changed files with 167 additions and 108 deletions

View file

@ -65,8 +65,46 @@ def NotebookProgressCallback_on_log(self, args, state, control, logs=None, **kwa
pass
def NotebookTrainingTracker_write_line(self, values):
"""
Write the values in the inner table.
Args:
values (`Dict[str, float]`): The values to display.
"""
if self.inner_table is None:
self.inner_table = [list(values.keys()), list(values.values())]
else:
columns = self.inner_table[0]
print(columns)
for key in values.keys():
if key not in columns:
columns.append(key)
self.inner_table[0] = columns
if len(self.inner_table) > 1:
last_values = self.inner_table[-1]
first_column = self.inner_table[0][0]
if last_values[0] != values[first_column]:
# write new line
self.inner_table.append([values[c] if c in values else "No Log" for c in columns])
else:
# update last line
new_values = values
for c in columns:
if c not in new_values.keys():
new_values[c] = last_values[columns.index(c)]
self.inner_table[-1] = [new_values[c] for c in columns]
else:
# Edit for evaluation purposes
self.inner_table.append([values[c] if c in values else 0 for c in columns])
pass
pass
pass
def PatchDPOTrainer():
# Patch DPO notebook printing
# NotebookTrainingTracker.write_line = NotebookTrainingTracker_write_line
from transformers.trainer import DEFAULT_PROGRESS_CALLBACK
DEFAULT_PROGRESS_CALLBACK.on_train_begin = NotebookProgressCallback_on_train_begin
DEFAULT_PROGRESS_CALLBACK.on_log = NotebookProgressCallback_on_log

View file

@ -161,11 +161,12 @@ pass
def fast_rms_layernorm_inference(self, X):
old_dtype = X.dtype
X = X.to(torch.float32)
variance = X.square().mean(-1, keepdim = True)
variance += self.variance_epsilon
X *= variance.rsqrt_()
X = X.to(residual.dtype)
X = X.to(old_dtype)
X *= self.weight
return X
pass
@ -660,14 +661,15 @@ class FastLlamaModel:
@staticmethod
def from_pretrained(
model_name = "unsloth/llama-2-7b-bnb-4bit",
model_name = "unsloth/llama-2-7b-bnb-4bit",
max_seq_length = 4096,
dtype = None,
load_in_4bit = True,
token = None,
device_map = "sequential",
rope_scaling = None,
fix_tokenizer = True,
dtype = None,
load_in_4bit = True,
token = None,
device_map = "sequential",
rope_scaling = None,
fix_tokenizer = True,
**kwargs,
):
SUPPORTS_BFLOAT16 = torch.cuda.is_bf16_supported()
gpu_stats = torch.cuda.get_device_properties(0)
@ -720,18 +722,19 @@ class FastLlamaModel:
max_position_embeddings = max(max_seq_length, model_max_seq_length)
model = AutoModelForCausalLM.from_pretrained(
model_name,
device_map = device_map,
torch_dtype = dtype,
quantization_config = bnb_config,
token = token,
rope_scaling = rope_scaling,
device_map = device_map,
torch_dtype = dtype,
quantization_config = bnb_config,
token = token,
rope_scaling = rope_scaling,
max_position_embeddings = max_position_embeddings,
**kwargs,
)
tokenizer = AutoTokenizer.from_pretrained(
model_name,
model_max_length = max_seq_length,
padding_side = "right",
token = token,
padding_side = "right",
token = token,
)
model, tokenizer = patch_tokenizer(model, tokenizer)
@ -755,12 +758,12 @@ class FastLlamaModel:
# We check the tokenizer first for errors
if fix_tokenizer:
tokenizer = check_tokenizer(
model = model,
tokenizer = tokenizer,
model_name = model_name,
model = model,
tokenizer = tokenizer,
model_name = model_name,
model_max_length = max_seq_length,
padding_side = "right",
token = token,
padding_side = "right",
token = token,
)
pass
patch_saving_functions(tokenizer)
@ -828,20 +831,20 @@ class FastLlamaModel:
@staticmethod
def get_peft_model(
model,
r = 16,
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 16,
lora_dropout = 0,
bias = "none",
r = 16,
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 16,
lora_dropout = 0,
bias = "none",
layers_to_transform = None,
layers_pattern = None,
layers_pattern = None,
use_gradient_checkpointing = True,
random_state = 3407,
max_seq_length = 2048, # not used anymore
use_rslora = False,
init_lora_weights = True,
loftq_config = None,
random_state = 3407,
max_seq_length = 2048, # not used anymore
use_rslora = False,
init_lora_weights = True,
loftq_config = None,
**kwargs,
):
if isinstance(model, PeftModelForCausalLM):
@ -909,12 +912,14 @@ class FastLlamaModel:
assert(type(use_rslora) is bool)
if use_rslora:
if not SUPPORTS_RSLORA:
# We do it ourselves!
new_alpha = lora_alpha / (r**0.5)
import peft
raise RuntimeError(
f"Unsloth: Your PEFT version of {peft.__version__} does not support use_rslora.\n"\
"Please install PEFT 0.7.2 or higher.\n"\
"You can also install from source: `pip install git+https://github.com/huggingface/peft.git"
logger.warning_once(
f"Unsloth: Your PEFT version of {peft.__version__} (0.7.2 needed) does not support `use_rslora` natively.\n"\
f"But, we do it ourselves by setting `alpha = {new_alpha}.`"
)
lora_alpha = new_alpha
pass
pass

View file

@ -63,14 +63,14 @@ pass
class FastLanguageModel(FastLlamaModel):
@staticmethod
def from_pretrained(
model_name = "unsloth/mistral-7b-bnb-4bit",
model_name = "unsloth/mistral-7b-bnb-4bit",
max_seq_length = 4096,
dtype = None,
load_in_4bit = True,
token = None,
device_map = "sequential",
rope_scaling = None,
fix_tokenizer = True,
dtype = None,
load_in_4bit = True,
token = None,
device_map = "sequential",
rope_scaling = None,
fix_tokenizer = True,
*args, **kwargs,
):
old_model_name = model_name
@ -106,14 +106,14 @@ class FastLanguageModel(FastLlamaModel):
pass
model, tokenizer = dispatch_model.from_pretrained(
model_name = model_name,
model_name = model_name,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
token = token,
device_map = device_map,
rope_scaling = rope_scaling,
fix_tokenizer = fix_tokenizer,
dtype = dtype,
load_in_4bit = load_in_4bit,
token = token,
device_map = device_map,
rope_scaling = rope_scaling,
fix_tokenizer = fix_tokenizer,
*args, **kwargs,
)

View file

@ -256,14 +256,15 @@ class FastMistralModel(FastLlamaModel):
@staticmethod
def from_pretrained(
model_name = "unsloth/mistral-7b-bnb-4bit",
model_name = "unsloth/mistral-7b-bnb-4bit",
max_seq_length = 4096,
dtype = None,
load_in_4bit = True,
token = None,
device_map = "sequential",
rope_scaling = None, # Mistral does not support RoPE scaling
fix_tokenizer = True,
dtype = None,
load_in_4bit = True,
token = None,
device_map = "sequential",
rope_scaling = None, # Mistral does not support RoPE scaling
fix_tokenizer = True,
**kwargs,
):
if rope_scaling is not None:
logger.warning_once("Unsloth: Mistral models do not support RoPE scaling.")
@ -305,6 +306,7 @@ class FastMistralModel(FastLlamaModel):
quantization_config = bnb_config,
token = token,
# rope_scaling = rope_scaling,
**kwargs,
)
tokenizer = AutoTokenizer.from_pretrained(
model_name,

View file

@ -94,7 +94,7 @@ def fast_save_pickle(shard, name):
torch.save(
shard,
name,
pickle_module = pickle,
pickle_module = pickle,
pickle_protocol = pickle.HIGHEST_PROTOCOL,
)
return
@ -106,7 +106,7 @@ def unsloth_save_model(
model,
tokenizer,
save_directory : Union[str, os.PathLike],
save_method : str = "lora", # ["lora", "merged_16bit", "merged_4bit"]
merge_method : str = "lora", # ["lora", "16bit", "4bit"]
push_to_hub : bool = False,
token : Optional[Union[str, bool]] = None,
is_main_process : bool = True,
@ -131,7 +131,7 @@ def unsloth_save_model(
maximum_memory_usage : float = 0.9,
):
save_pretrained_settings = dict(locals())
for deletion in ("model", "tokenizer", "save_method", "temporary_location", "maximum_memory_usage"):
for deletion in ("model", "tokenizer", "merge_method", "temporary_location", "maximum_memory_usage"):
del save_pretrained_settings[deletion]
pass
import re
@ -144,8 +144,8 @@ def unsloth_save_model(
gc.collect()
pass
save_method = save_method.lower().replace(" ", "_")
if save_method != "lora" and save_method != "merged_16bit" and save_method != "merged_4bit":
merge_method = merge_method.lower().replace(" ", "_")
if merge_method != "lora" and merge_method != "16bit" and merge_method != "4bit":
raise RuntimeError(
"Unsloth: You must select one of 3 options when saving models:\n"\
'"lora" ==> This is the fastest and easiest. Just saves LoRA modules.\n'\
@ -154,7 +154,7 @@ def unsloth_save_model(
)
pass
if save_method == "merged_4bit":
if merge_method == "4bit":
print("Unsloth: Merging 4bit and LoRA weights to 4bit...")
print("This might take 5 minutes...")
model = model.merge_and_unload()
@ -169,7 +169,7 @@ def unsloth_save_model(
pass
save_pretrained_settings["tags"] = tags
if (save_method == "lora") and push_to_hub:
if (merge_method == "lora") and push_to_hub:
if token is None:
raise RuntimeError(
"Unsloth: Pushing to HF requires a token. Pass `token = 'hf_....'`\n"\
@ -222,7 +222,7 @@ def unsloth_save_model(
save_directory = new_save_directory
pass
if (save_method == "merged_4bit") or (save_method == "lora") or (
if (merge_method == "4bit") or (merge_method == "lora") or (
not hasattr(model, "model") or \
not hasattr(model.model, "model") or \
not hasattr(model.model.model, "layers")
@ -246,7 +246,7 @@ def unsloth_save_model(
print()
print("Unsloth: Saving model...", end = "")
if save_method != "lora": print(" This might take 10 minutes for Llama-7b...", end = "")
if merge_method != "lora": print(" This might take 10 minutes for Llama-7b...", end = "")
model.save_pretrained(**save_pretrained_settings)
print(" Done.")
@ -434,19 +434,19 @@ pass
def save_to_gguf(
model_directory : str = "unsloth_finetuned_model",
quantization_method : str = "fast_quantized",
model_directory : str = "unsloth_finetuned_model",
quantization : str = "fast_quantized",
_run_installer = None, # Non blocking install of llama.cpp
):
from transformers.models.llama.modeling_llama import logger
if quantization_method == "not_quantized": quantization_method = "f16"
elif quantization_method == "fast_quantized": quantization_method = "q8_0"
elif quantization_method == "quantized": quantization_method = "q4_k_m"
elif quantization_method is None: quantization_method = "q8_0"
if quantization == "not_quantized": quantization = "f16"
elif quantization == "fast_quantized": quantization = "q8_0"
elif quantization == "quantized": quantization = "q4_k_m"
elif quantization is None: quantization = "q8_0"
if quantization_method not in ALLOWED_QUANTS.keys():
error = f"Unsloth: Quant method = [{quantization_method}] not supported. Choose from below:\n"
if quantization not in ALLOWED_QUANTS.keys():
error = f"Unsloth: Quant method = [{quantization}] not supported. Choose from below:\n"
for key, value in ALLOWED_QUANTS.items():
error += f"[{key}] => {value}\n"
raise RuntimeError(error)
@ -456,7 +456,7 @@ def save_to_gguf(
f"==((====))== Unsloth: Conversion from QLoRA to GGUF information\n"\
f" \\\ /| [0] Installing llama.cpp will take 3 minutes.\n"\
f"O^O/ \_/ \\ [1] Converting HF to GGUF 16bits will take 3 minutes.\n"\
f"\ / [2] Converting GGUF 16bits to {quantization_method} will take 20 minutes.\n"\
f"\ / [2] Converting GGUF 16bits to {quantization} will take 20 minutes.\n"\
f' "-____-" In total, you will have to wait around 26 minutes.\n'
print(print_info)
@ -469,9 +469,9 @@ def save_to_gguf(
print("Unsloth: [1] Converting HF into GGUF format. This will take 3 minutes...")
first_conversion = "f16"
if quantization_method == "f32": first_conversion = "f32"
elif quantization_method == "f16": first_conversion = "f16"
elif quantization_method == "q8_0": first_conversion = "q8_0"
if quantization == "f32": first_conversion = "f32"
elif quantization == "f16": first_conversion = "f16"
elif quantization == "q8_0": first_conversion = "q8_0"
n_cpus = psutil.cpu_count()*2
# Concurrency from https://rentry.org/llama-cpp-conversions#merging-loras-into-a-model
@ -489,13 +489,13 @@ def save_to_gguf(
print(f"Unsloth: Conversion completed! Output location: {final_location}")
if quantization_method != first_conversion:
if quantization != first_conversion:
old_location = final_location
print(f"Unsloth: [2] Converting GGUF 16bit into {quantization_method}. This will take 20 minutes...")
final_location = f"./{model_directory}-unsloth.{quantization_method.upper()}.gguf"
print(f"Unsloth: [2] Converting GGUF 16bit into {quantization}. This will take 20 minutes...")
final_location = f"./{model_directory}-unsloth.{quantization.upper()}.gguf"
command = f"./llama.cpp/quantize {old_location} "\
f"{final_location} {quantization_method} {n_cpus}"
f"{final_location} {quantization} {n_cpus}"
with subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, bufsize = 1) as sp:
for line in sp.stdout:
@ -511,7 +511,8 @@ pass
def unsloth_save_pretrained_merged(
self,
save_directory : Union[str, os.PathLike],
save_method : str = "merged_16bit", # ["lora", "merged_16bit", "merged_4bit"]
tokenizer = None,
merge_method : str = "16bit", # ["lora", "16bit", "4bit"]
push_to_hub : bool = False,
token : Optional[Union[str, bool]] = None,
is_main_process : bool = True,
@ -529,14 +530,20 @@ def unsloth_save_pretrained_merged(
Same as .save_pretrained(...) except 4bit weights are auto
converted to float16 with as little overhead as possible.
Choose for `save_method` to be either:
1. `merged_16bit`: Merge LoRA into float16 weights. Useful for GGUF / llama.cpp.
2. `merged_4bit`: Merge LoRA into int4 weights. Useful for DPO / HF inference.
3. `lora`: Save LoRA adapters with no merging. Useful for HF inference.
Choose for `merge_method` to be either:
1. `16bit`: Merge LoRA into float16 weights. Useful for GGUF / llama.cpp.
2. `4bit`: Merge LoRA into int4 weights. Useful for DPO / HF inference.
3. `lora`: Save LoRA adapters with no merging. Useful for HF inference.
"""
if tokenizer is None:
logger.warning_once(
"Unsloth: You're not saving a tokenizer as well?\n"\
"You can do it separately via `tokenizer.save_pretrained(...)`"
)
pass
arguments = dict(locals())
arguments["model"] = self
arguments["tokenizer"] = None
arguments["model"] = self
del arguments["self"]
unsloth_save_model(**arguments)
for _ in range(3):
@ -547,7 +554,8 @@ pass
def unsloth_push_to_hub_merged(
self,
repo_id : str,
save_method : str = "merged_16bit", # ["lora", "merged_16bit", "merged_4bit"]
tokenizer = None,
merge_method : str = "16bit", # ["lora", "16bit", "4bit"]
use_temp_dir : Optional[bool] = None,
commit_message : Optional[str] = None,
private : Optional[bool] = None,
@ -565,14 +573,20 @@ def unsloth_push_to_hub_merged(
Same as .push_to_hub(...) except 4bit weights are auto
converted to float16 with as little overhead as possible.
Choose for `save_method` to be either:
1. `merged_16bit`: Merge LoRA into float16 weights. Useful for GGUF / llama.cpp.
2. `merged_4bit`: Merge LoRA into int4 weights. Useful for DPO / HF inference.
3. `lora`: Save LoRA adapters with no merging. Useful for HF inference.
Choose for `merge_method` to be either:
1. `16bit`: Merge LoRA into float16 weights. Useful for GGUF / llama.cpp.
2. `4bit`: Merge LoRA into int4 weights. Useful for DPO / HF inference.
3. `lora`: Save LoRA adapters with no merging. Useful for HF inference.
"""
if tokenizer is None:
logger.warning_once(
"Unsloth: You're not saving a tokenizer as well?\n"\
"You can do it separately via `tokenizer.push_to_hub(...)`"
)
pass
arguments = dict(locals())
arguments["model"] = self
arguments["tokenizer"] = None
arguments["save_directory"] = repo_id
arguments["push_to_hub"] = True
del arguments["self"]
@ -587,7 +601,7 @@ def unsloth_save_pretrained_gguf(
self,
save_directory : Union[str, os.PathLike],
tokenizer = None,
quantization_method : str = "fast_quantized",
quantization : str = "fast_quantized",
push_to_hub : bool = False,
token : Optional[Union[str, bool]] = None,
is_main_process : bool = True,
@ -605,7 +619,7 @@ def unsloth_save_pretrained_gguf(
Same as .save_pretrained(...) except 4bit weights are auto
converted to float16 then converted to GGUF / llama.cpp format.
Choose for `quantization_method` to be:
Choose for `quantization` to be:
"not_quantized" : "Recommended. Fast conversion. Slow inference, big files.",
"fast_quantized" : "Recommended. Fast conversion. OK inference, OK file size.",
"quantized" : "Recommended. Slow conversion. Fast inference, small files.",
@ -630,12 +644,12 @@ def unsloth_save_pretrained_gguf(
raise ValueError("Unsloth: Saving to GGUF must have a tokenizer.")
arguments = dict(locals())
arguments["model"] = self
arguments["tokenizer"] = tokenizer
arguments["push_to_hub"] = False # We save ourselves
arguments["save_method"] = "merged_16bit" # Must be 16bit
arguments["model"] = self
arguments["tokenizer"] = tokenizer
arguments["push_to_hub"] = False # We save ourselves
arguments["merge_method"] = "16bit" # Must be 16bit
del arguments["self"]
del arguments["quantization_method"]
del arguments["quantization"]
# Non blocking install GGUF first
git_clone = install_llama_cpp_clone_non_blocking()
@ -648,7 +662,7 @@ def unsloth_save_pretrained_gguf(
for _ in range(3):
gc.collect()
file_location = save_to_gguf(new_save_directory, quantization_method, makefile)
file_location = save_to_gguf(new_save_directory, quantization, makefile)
# And save to HF
if push_to_hub:
@ -685,7 +699,7 @@ def unsloth_push_to_hub_gguf(
self,
repo_id : str,
tokenizer = None,
quantization_method : str = "fast_quantized",
quantization : str = "fast_quantized",
use_temp_dir : Optional[bool] = None,
commit_message : Optional[str] = None,
private : Optional[bool] = None,
@ -703,7 +717,7 @@ def unsloth_push_to_hub_gguf(
Same as .push_to_hub(...) except 4bit weights are auto
converted to float16 then converted to GGUF / llama.cpp format.
Choose for `quantization_method` to be:
Choose for `quantization` to be:
"not_quantized" : "Recommended. Fast conversion. Slow inference, big files.",
"fast_quantized" : "Recommended. Fast conversion. OK inference, OK file size.",
"quantized" : "Recommended. Slow conversion. Fast inference, small files.",
@ -732,10 +746,10 @@ def unsloth_push_to_hub_gguf(
arguments["tokenizer"] = tokenizer
arguments["save_directory"] = repo_id
arguments["push_to_hub"] = False # We save ourselves
arguments["save_method"] = "merged_16bit" # Must be 16bit
arguments["merge_method"] = "16bit" # Must be 16bit
del arguments["self"]
del arguments["repo_id"]
del arguments["quantization_method"]
del arguments["quantization"]
# Non blocking install GGUF first
git_clone = install_llama_cpp_clone_non_blocking()
@ -748,7 +762,7 @@ def unsloth_push_to_hub_gguf(
gc.collect()
python_install.wait()
file_location = save_to_gguf(new_save_directory, quantization_method, makefile)
file_location = save_to_gguf(new_save_directory, quantization, makefile)
# Save to hub
print("Unsloth: Uploading GGUF to Huggingface Hub...")