@@ -460,6 +461,27 @@ export function ExportDialog({
)}
+ {/* Success banner for log-driven exports.
+ Keep users on the log screen after completion so they can
+ inspect conversion output before closing. */}
+ {exportSuccess && showLogPanel && (
+
+
+
+
+ {destination === "hub"
+ ? "Export finished and pushed to Hugging Face Hub."
+ : "Export finished successfully."}
+
+ {exportOutputPath ? (
+
+ {exportOutputPath}
+
+ ) : null}
+
+
+ )}
+
{/* Error banner */}
{exportError && (
@@ -577,14 +599,16 @@ export function ExportDialog({
onClick={() => onOpenChange(false)}
disabled={exporting}
>
- Cancel
+ {exportSuccess ? "Done" : "Cancel"}
-
+
{exporting ? (
Exporting…
+ ) : exportSuccess ? (
+ "Export Complete"
) : (
"Start Export"
)}
diff --git a/studio/frontend/src/features/export/constants.ts b/studio/frontend/src/features/export/constants.ts
index ff7849059..e9c3b8c95 100644
--- a/studio/frontend/src/features/export/constants.ts
+++ b/studio/frontend/src/features/export/constants.ts
@@ -36,14 +36,14 @@ export const EXPORT_METHODS: {
];
export const QUANT_OPTIONS = [
+ { value: "q2_k_l", label: "Q2_K_L", size: "~2.9 GB" },
{ value: "q3_k_m", label: "Q3_K_M", size: "~3.5 GB" },
- { value: "q4_0", label: "Q4_0", size: "~4.1 GB" },
{ value: "q4_k_m", label: "Q4_K_M", size: "~4.8 GB", recommended: true },
- { value: "q5_0", label: "Q5_0", size: "~5.0 GB" },
{ value: "q5_k_m", label: "Q5_K_M", size: "~5.6 GB" },
+ { value: "q6_k", label: "Q6_K", size: "~6.6 GB" },
{ value: "q8_0", label: "Q8_0", size: "~8.2 GB" },
+ { value: "bf16", label: "BF16", size: "~14.2 GB" },
{ value: "f16", label: "F16", size: "~14.2 GB" },
- { value: "f32", label: "F32", size: "~28.4 GB" },
];
export function getEstimatedSize(
diff --git a/unsloth/save.py b/unsloth/save.py
index 3c318fab0..22ae5487d 100644
--- a/unsloth/save.py
+++ b/unsloth/save.py
@@ -122,6 +122,7 @@ ALLOWED_QUANTS = {
"q4_k_m": "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K",
"q5_k_m": "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K",
"q2_k": "Uses Q4_K for the attention.vw and feed_forward.w2 tensors, Q2_K for the other tensors.",
+ "q2_k_l": "Q2_K_L with q8_0 output/token embeddings for higher quality than plain Q2_K.",
"q3_k_l": "Uses Q5_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
"q3_k_m": "Uses Q4_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
"q3_k_s": "Uses Q3_K for all tensors",
@@ -153,6 +154,89 @@ def print_quantization_methods():
print(f'"{key}" ==> {value}')
+def _quantize_q2_k_l(
+ input_gguf: Union[str, os.PathLike],
+ output_gguf: Union[str, os.PathLike],
+ quantizer_location: Union[str, os.PathLike],
+ n_threads: int,
+ print_output: bool = True,
+):
+ # "Q2_K_L" is an Unsloth-side preset, not a native llama.cpp ftype. It
+ # maps to the `q2_k` ftype with `--output-tensor-type q8_0` and
+ # `--token-embedding-type q8_0` so the output/embedding tensors retain
+ # higher precision than a plain Q2_K quant.
+ command = [
+ str(quantizer_location),
+ "--output-tensor-type",
+ "q8_0",
+ "--token-embedding-type",
+ "q8_0",
+ str(input_gguf),
+ str(output_gguf),
+ "q2_k",
+ str(n_threads),
+ ]
+
+ if print_output:
+ print(
+ "Unsloth: Quantizing as Q2_K_L preset "
+ "(q2_k + --output-tensor-type q8_0 --token-embedding-type q8_0)..."
+ )
+
+ try:
+ if print_output:
+ with subprocess.Popen(
+ command,
+ shell = False,
+ text = True,
+ stdout = subprocess.PIPE,
+ stderr = subprocess.STDOUT,
+ bufsize = 1,
+ ) as sp:
+ assert sp.stdout is not None
+ for line in sp.stdout:
+ print(line, end = "", flush = True)
+
+ returncode = sp.wait()
+ if returncode != 0:
+ raise RuntimeError(
+ f"Failed to quantize {input_gguf} to q2_k_l: process exited with code {returncode}"
+ )
+ else:
+ subprocess.run(
+ command,
+ shell = False,
+ check = True,
+ capture_output = True,
+ text = True,
+ )
+ except subprocess.CalledProcessError as e:
+ if print_output and hasattr(e, "stdout") and e.stdout:
+ print(e.stdout)
+ error_details = ""
+ if hasattr(e, "stdout") and e.stdout:
+ error_details += f"\nSubprocess stdout:\n{e.stdout}"
+ if hasattr(e, "stderr") and e.stderr:
+ error_details += f"\nSubprocess stderr:\n{e.stderr}"
+ raise RuntimeError(
+ f"Failed to quantize {input_gguf} to q2_k_l: {e}{error_details}"
+ )
+
+ output_path = Path(output_gguf)
+ if not output_path.exists():
+ raise RuntimeError(
+ f"Quantization failed - output file {output_gguf} not created"
+ )
+
+ if print_output:
+ file_size_bytes = output_path.stat().st_size
+ file_size_gb = file_size_bytes / (1024**3)
+ print(
+ f"Unsloth: Successfully quantized to {output_gguf} (size: {file_size_gb:.2f}GB)"
+ )
+ return str(output_gguf)
+
+
def check_if_sentencepiece_model(
model, temporary_location = "_unsloth_sentencepiece_temp"
):
@@ -1305,14 +1389,23 @@ def save_to_gguf(
gguf_directory, f"{model_name}.{quant_method.upper()}.gguf"
)
try:
- # Use the quantize_gguf function we created
- quantized_file = quantize_gguf(
- input_gguf = base_gguf,
- output_gguf = output_location,
- quant_type = quant_method,
- quantizer_location = quantizer_location,
- print_output = print_output,
- )
+ if quant_method == "q2_k_l":
+ quantized_file = _quantize_q2_k_l(
+ input_gguf = base_gguf,
+ output_gguf = output_location,
+ quantizer_location = quantizer_location,
+ n_threads = n_cpus,
+ print_output = print_output,
+ )
+ else:
+ # Use unsloth-zoo's standard quantization for all other methods
+ quantized_file = quantize_gguf(
+ input_gguf = base_gguf,
+ output_gguf = output_location,
+ quant_type = quant_method,
+ quantizer_location = quantizer_location,
+ print_output = print_output,
+ )
all_saved_locations.append(quantized_file)
quants_created = True
except Exception as e:
@@ -1880,6 +1973,7 @@ def unsloth_save_pretrained_gguf(
"q4_k_m" : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K",
"q5_k_m" : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K",
"q2_k" : "Uses Q4_K for the attention.vw and feed_forward.w2 tensors, Q2_K for the other tensors.",
+ "q2_k_l" : "Q2_K_L with --output-tensor-type q8_0 --token-embedding-type q8_0.",
"q3_k_l" : "Uses Q5_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
"q3_k_m" : "Uses Q4_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
"q3_k_s" : "Uses Q3_K for all tensors",
@@ -2203,6 +2297,7 @@ def unsloth_push_to_hub_gguf(
"q4_k_m" : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K",
"q5_k_m" : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K",
"q2_k" : "Uses Q4_K for the attention.vw and feed_forward.w2 tensors, Q2_K for the other tensors.",
+ "q2_k_l" : "Q2_K_L with --output-tensor-type q8_0 --token-embedding-type q8_0.",
"q3_k_l" : "Uses Q5_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
"q3_k_m" : "Uses Q4_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
"q3_k_s" : "Uses Q3_K for all tensors",