Merge branch 'main' into pre-commit-ci-update-config

2026-04-21 13:37:39 +00:00 · 2026-04-20 23:15:20 +04:00 · 2026-04-20 23:15:20 +04:00 · 4493e84274
commit 4493e84274
parent c0cf611104 d3215ce113
3 changed files with 138 additions and 19 deletions
--- a/studio/frontend/src/features/export/components/export-dialog.tsx
+++ b/studio/frontend/src/features/export/components/export-dialog.tsx
@ -272,12 +272,13 @@ export function ExportDialog({
  exportSuccess,
  exportOutputPath,
 }: ExportDialogProps) {
-  // Live log capture is only meaningful for export methods that run
-  // a slow subprocess operation with interesting stdout: merged and
-  // gguf. LoRA adapter export is a fast disk write and would just
-  // show a blank panel, so we hide it there.
+  // Live log capture is useful for any export path executed by the
+  // backend worker, including LoRA adapter-only export.
  const showLogPanel =
-    exportMethod === "merged" || exportMethod === "gguf";
+    exportMethod === "merged" ||
+    exportMethod === "gguf" ||
+    exportMethod === "lora";
+  const showCompletionScreen = exportSuccess && !showLogPanel;

  const { lines: logLines, connected: logConnected, error: logError } =
    useExportLogs(exporting && showLogPanel, exportMethod, open);
@ -314,7 +315,7 @@ export function ExportDialog({
        className={showLogPanel ? "sm:max-w-2xl" : "sm:max-w-lg"}
        onInteractOutside={(e) => { if (exporting) e.preventDefault(); }}
      >
-        {exportSuccess ? (
+        {showCompletionScreen ? (
          <>
            <div className="flex flex-col items-center gap-3 py-6">
              <div className="flex size-12 items-center justify-center rounded-full bg-emerald-500/10">
@ -460,6 +461,27 @@ export function ExportDialog({
              )}
            </AnimatePresence>

+            {/* Success banner for log-driven exports.
+                Keep users on the log screen after completion so they can
+                inspect conversion output before closing. */}
+            {exportSuccess && showLogPanel && (
+              <div className="flex items-start gap-2 rounded-lg bg-emerald-500/10 p-3 text-sm text-emerald-700 dark:text-emerald-300">
+                <HugeiconsIcon icon={CheckmarkCircle02Icon} className="mt-0.5 size-4 shrink-0" />
+                <div className="flex min-w-0 flex-col gap-1">
+                  <span>
+                    {destination === "hub"
+                      ? "Export finished and pushed to Hugging Face Hub."
+                      : "Export finished successfully."}
+                  </span>
+                  {exportOutputPath ? (
+                    <code className="select-all break-all font-mono text-[12px] text-foreground/90" title={exportOutputPath}>
+                      {exportOutputPath}
+                    </code>
+                  ) : null}
+                </div>
+              </div>
+            )}
+
            {/* Error banner */}
            {exportError && (
              <div className="flex items-start gap-2 rounded-lg bg-destructive/10 p-3 text-sm text-destructive">
@ -577,14 +599,16 @@ export function ExportDialog({
                onClick={() => onOpenChange(false)}
                disabled={exporting}
              >
-                Cancel
+                {exportSuccess ? "Done" : "Cancel"}
              </Button>
-              <Button onClick={onExport} disabled={exporting}>
+              <Button onClick={onExport} disabled={exporting || exportSuccess}>
                {exporting ? (
                  <span className="flex items-center gap-2">
                    <Spinner className="size-4" />
                    Exporting…
                  </span>
+                ) : exportSuccess ? (
+                  "Export Complete"
                ) : (
                  "Start Export"
                )}
--- a/studio/frontend/src/features/export/constants.ts
+++ b/studio/frontend/src/features/export/constants.ts
@ -36,14 +36,14 @@ export const EXPORT_METHODS: {
 ];

 export const QUANT_OPTIONS = [
+  { value: "q2_k_l", label: "Q2_K_L", size: "~2.9 GB" },
  { value: "q3_k_m", label: "Q3_K_M", size: "~3.5 GB" },
-  { value: "q4_0", label: "Q4_0", size: "~4.1 GB" },
  { value: "q4_k_m", label: "Q4_K_M", size: "~4.8 GB", recommended: true },
-  { value: "q5_0", label: "Q5_0", size: "~5.0 GB" },
  { value: "q5_k_m", label: "Q5_K_M", size: "~5.6 GB" },
+  { value: "q6_k", label: "Q6_K", size: "~6.6 GB" },
  { value: "q8_0", label: "Q8_0", size: "~8.2 GB" },
+  { value: "bf16", label: "BF16", size: "~14.2 GB" },
  { value: "f16", label: "F16", size: "~14.2 GB" },
-  { value: "f32", label: "F32", size: "~28.4 GB" },
 ];

 export function getEstimatedSize(
--- a/unsloth/save.py
+++ b/unsloth/save.py
@ -122,6 +122,7 @@ ALLOWED_QUANTS = {
    "q4_k_m": "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K",
    "q5_k_m": "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K",
    "q2_k": "Uses Q4_K for the attention.vw and feed_forward.w2 tensors, Q2_K for the other tensors.",
+    "q2_k_l": "Q2_K_L with q8_0 output/token embeddings for higher quality than plain Q2_K.",
    "q3_k_l": "Uses Q5_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
    "q3_k_m": "Uses Q4_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
    "q3_k_s": "Uses Q3_K for all tensors",
@ -153,6 +154,89 @@ def print_quantization_methods():
        print(f'"{key}"  ==> {value}')


+def _quantize_q2_k_l(
+    input_gguf: Union[str, os.PathLike],
+    output_gguf: Union[str, os.PathLike],
+    quantizer_location: Union[str, os.PathLike],
+    n_threads: int,
+    print_output: bool = True,
+):
+    # "Q2_K_L" is an Unsloth-side preset, not a native llama.cpp ftype. It
+    # maps to the `q2_k` ftype with `--output-tensor-type q8_0` and
+    # `--token-embedding-type q8_0` so the output/embedding tensors retain
+    # higher precision than a plain Q2_K quant.
+    command = [
+        str(quantizer_location),
+        "--output-tensor-type",
+        "q8_0",
+        "--token-embedding-type",
+        "q8_0",
+        str(input_gguf),
+        str(output_gguf),
+        "q2_k",
+        str(n_threads),
+    ]
+
+    if print_output:
+        print(
+            "Unsloth: Quantizing as Q2_K_L preset "
+            "(q2_k + --output-tensor-type q8_0 --token-embedding-type q8_0)..."
+        )
+
+    try:
+        if print_output:
+            with subprocess.Popen(
+                command,
+                shell = False,
+                text = True,
+                stdout = subprocess.PIPE,
+                stderr = subprocess.STDOUT,
+                bufsize = 1,
+            ) as sp:
+                assert sp.stdout is not None
+                for line in sp.stdout:
+                    print(line, end = "", flush = True)
+
+                returncode = sp.wait()
+                if returncode != 0:
+                    raise RuntimeError(
+                        f"Failed to quantize {input_gguf} to q2_k_l: process exited with code {returncode}"
+                    )
+        else:
+            subprocess.run(
+                command,
+                shell = False,
+                check = True,
+                capture_output = True,
+                text = True,
+            )
+    except subprocess.CalledProcessError as e:
+        if print_output and hasattr(e, "stdout") and e.stdout:
+            print(e.stdout)
+        error_details = ""
+        if hasattr(e, "stdout") and e.stdout:
+            error_details += f"\nSubprocess stdout:\n{e.stdout}"
+        if hasattr(e, "stderr") and e.stderr:
+            error_details += f"\nSubprocess stderr:\n{e.stderr}"
+        raise RuntimeError(
+            f"Failed to quantize {input_gguf} to q2_k_l: {e}{error_details}"
+        )
+
+    output_path = Path(output_gguf)
+    if not output_path.exists():
+        raise RuntimeError(
+            f"Quantization failed - output file {output_gguf} not created"
+        )
+
+    if print_output:
+        file_size_bytes = output_path.stat().st_size
+        file_size_gb = file_size_bytes / (1024**3)
+        print(
+            f"Unsloth: Successfully quantized to {output_gguf} (size: {file_size_gb:.2f}GB)"
+        )
+    return str(output_gguf)
+
+
 def check_if_sentencepiece_model(
    model, temporary_location = "_unsloth_sentencepiece_temp"
 ):
@ -1305,14 +1389,23 @@ def save_to_gguf(
                    gguf_directory, f"{model_name}.{quant_method.upper()}.gguf"
                )
                try:
-                    # Use the quantize_gguf function we created
-                    quantized_file = quantize_gguf(
-                        input_gguf = base_gguf,
-                        output_gguf = output_location,
-                        quant_type = quant_method,
-                        quantizer_location = quantizer_location,
-                        print_output = print_output,
-                    )
+                    if quant_method == "q2_k_l":
+                        quantized_file = _quantize_q2_k_l(
+                            input_gguf = base_gguf,
+                            output_gguf = output_location,
+                            quantizer_location = quantizer_location,
+                            n_threads = n_cpus,
+                            print_output = print_output,
+                        )
+                    else:
+                        # Use unsloth-zoo's standard quantization for all other methods
+                        quantized_file = quantize_gguf(
+                            input_gguf = base_gguf,
+                            output_gguf = output_location,
+                            quant_type = quant_method,
+                            quantizer_location = quantizer_location,
+                            print_output = print_output,
+                        )
                    all_saved_locations.append(quantized_file)
                    quants_created = True
                except Exception as e:
@ -1880,6 +1973,7 @@ def unsloth_save_pretrained_gguf(
    "q4_k_m"  : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K",
    "q5_k_m"  : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K",
    "q2_k"    : "Uses Q4_K for the attention.vw and feed_forward.w2 tensors, Q2_K for the other tensors.",
+    "q2_k_l"  : "Q2_K_L with --output-tensor-type q8_0 --token-embedding-type q8_0.",
    "q3_k_l"  : "Uses Q5_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
    "q3_k_m"  : "Uses Q4_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
    "q3_k_s"  : "Uses Q3_K for all tensors",
@ -2203,6 +2297,7 @@ def unsloth_push_to_hub_gguf(
    "q4_k_m"  : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K",
    "q5_k_m"  : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K",
    "q2_k"    : "Uses Q4_K for the attention.vw and feed_forward.w2 tensors, Q2_K for the other tensors.",
+    "q2_k_l"  : "Q2_K_L with --output-tensor-type q8_0 --token-embedding-type q8_0.",
    "q3_k_l"  : "Uses Q5_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
    "q3_k_m"  : "Uses Q4_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
    "q3_k_s"  : "Uses Q3_K for all tensors",