Studio: Show LoRA live logs and update GGUF quant options (#5058)

* export: update GGUF quant list and ordering

* gguf: add Q2_K_L quantize flags for output and embeddings

* export: add live console logs for LoRA export flow

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix: stream q2_k_l quantize logs and include subprocess error details

* fix: route Q2_K_L preset to q2_k ftype with q8_0 output+embeddings

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Roland Tannous <115670425+rolandtannous@users.noreply.github.com>
This commit is contained in:
Lee Jackson 2026-04-20 20:14:49 +01:00 committed by GitHub
parent 9c8a079d97
commit d3215ce113
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 138 additions and 19 deletions

View file

@ -272,12 +272,13 @@ export function ExportDialog({
exportSuccess,
exportOutputPath,
}: ExportDialogProps) {
// Live log capture is only meaningful for export methods that run
// a slow subprocess operation with interesting stdout: merged and
// gguf. LoRA adapter export is a fast disk write and would just
// show a blank panel, so we hide it there.
// Live log capture is useful for any export path executed by the
// backend worker, including LoRA adapter-only export.
const showLogPanel =
exportMethod === "merged" || exportMethod === "gguf";
exportMethod === "merged" ||
exportMethod === "gguf" ||
exportMethod === "lora";
const showCompletionScreen = exportSuccess && !showLogPanel;
const { lines: logLines, connected: logConnected, error: logError } =
useExportLogs(exporting && showLogPanel, exportMethod, open);
@ -314,7 +315,7 @@ export function ExportDialog({
className={showLogPanel ? "sm:max-w-2xl" : "sm:max-w-lg"}
onInteractOutside={(e) => { if (exporting) e.preventDefault(); }}
>
{exportSuccess ? (
{showCompletionScreen ? (
<>
<div className="flex flex-col items-center gap-3 py-6">
<div className="flex size-12 items-center justify-center rounded-full bg-emerald-500/10">
@ -460,6 +461,27 @@ export function ExportDialog({
)}
</AnimatePresence>
{/* Success banner for log-driven exports.
Keep users on the log screen after completion so they can
inspect conversion output before closing. */}
{exportSuccess && showLogPanel && (
<div className="flex items-start gap-2 rounded-lg bg-emerald-500/10 p-3 text-sm text-emerald-700 dark:text-emerald-300">
<HugeiconsIcon icon={CheckmarkCircle02Icon} className="mt-0.5 size-4 shrink-0" />
<div className="flex min-w-0 flex-col gap-1">
<span>
{destination === "hub"
? "Export finished and pushed to Hugging Face Hub."
: "Export finished successfully."}
</span>
{exportOutputPath ? (
<code className="select-all break-all font-mono text-[12px] text-foreground/90" title={exportOutputPath}>
{exportOutputPath}
</code>
) : null}
</div>
</div>
)}
{/* Error banner */}
{exportError && (
<div className="flex items-start gap-2 rounded-lg bg-destructive/10 p-3 text-sm text-destructive">
@ -577,14 +599,16 @@ export function ExportDialog({
onClick={() => onOpenChange(false)}
disabled={exporting}
>
Cancel
{exportSuccess ? "Done" : "Cancel"}
</Button>
<Button onClick={onExport} disabled={exporting}>
<Button onClick={onExport} disabled={exporting || exportSuccess}>
{exporting ? (
<span className="flex items-center gap-2">
<Spinner className="size-4" />
Exporting
</span>
) : exportSuccess ? (
"Export Complete"
) : (
"Start Export"
)}

View file

@ -36,14 +36,14 @@ export const EXPORT_METHODS: {
];
// Quantization presets offered for GGUF export, ordered by ascending
// approximate file size. `size` is a rough estimate shown to the user,
// and `recommended` marks the preset surfaced as the suggested choice.
// NOTE: "q2_k_l" is an Unsloth-side preset (the q2_k ftype with q8_0
// output/token-embedding tensors), not a native llama.cpp ftype.
export const QUANT_OPTIONS = [
  { value: "q2_k_l", label: "Q2_K_L", size: "~2.9 GB" },
  { value: "q3_k_m", label: "Q3_K_M", size: "~3.5 GB" },
  { value: "q4_0", label: "Q4_0", size: "~4.1 GB" },
  { value: "q4_k_m", label: "Q4_K_M", size: "~4.8 GB", recommended: true },
  { value: "q5_0", label: "Q5_0", size: "~5.0 GB" },
  { value: "q5_k_m", label: "Q5_K_M", size: "~5.6 GB" },
  { value: "q6_k", label: "Q6_K", size: "~6.6 GB" },
  { value: "q8_0", label: "Q8_0", size: "~8.2 GB" },
  { value: "bf16", label: "BF16", size: "~14.2 GB" },
  { value: "f16", label: "F16", size: "~14.2 GB" },
  { value: "f32", label: "F32", size: "~28.4 GB" },
];
export function getEstimatedSize(

View file

@ -122,6 +122,7 @@ ALLOWED_QUANTS = {
"q4_k_m": "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K",
"q5_k_m": "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K",
"q2_k": "Uses Q4_K for the attention.vw and feed_forward.w2 tensors, Q2_K for the other tensors.",
"q2_k_l": "Q2_K_L with q8_0 output/token embeddings for higher quality than plain Q2_K.",
"q3_k_l": "Uses Q5_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
"q3_k_m": "Uses Q4_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
"q3_k_s": "Uses Q3_K for all tensors",
@ -153,6 +154,89 @@ def print_quantization_methods():
print(f'"{key}" ==> {value}')
def _quantize_q2_k_l(
input_gguf: Union[str, os.PathLike],
output_gguf: Union[str, os.PathLike],
quantizer_location: Union[str, os.PathLike],
n_threads: int,
print_output: bool = True,
):
# "Q2_K_L" is a Unsloth-side preset, not a native llama.cpp ftype. It
# maps to the `q2_k` ftype with `--output-tensor-type q8_0` and
# `--token-embedding-type q8_0` so the output/embedding tensors retain
# higher precision than a plain Q2_K quant.
command = [
str(quantizer_location),
"--output-tensor-type",
"q8_0",
"--token-embedding-type",
"q8_0",
str(input_gguf),
str(output_gguf),
"q2_k",
str(n_threads),
]
if print_output:
print(
"Unsloth: Quantizing as Q2_K_L preset "
"(q2_k + --output-tensor-type q8_0 --token-embedding-type q8_0)..."
)
try:
if print_output:
with subprocess.Popen(
command,
shell = False,
text = True,
stdout = subprocess.PIPE,
stderr = subprocess.STDOUT,
bufsize = 1,
) as sp:
assert sp.stdout is not None
for line in sp.stdout:
print(line, end = "", flush = True)
returncode = sp.wait()
if returncode != 0:
raise RuntimeError(
f"Failed to quantize {input_gguf} to q2_k_l: process exited with code {returncode}"
)
else:
subprocess.run(
command,
shell = False,
check = True,
capture_output = True,
text = True,
)
except subprocess.CalledProcessError as e:
if print_output and hasattr(e, "stdout") and e.stdout:
print(e.stdout)
error_details = ""
if hasattr(e, "stdout") and e.stdout:
error_details += f"\nSubprocess stdout:\n{e.stdout}"
if hasattr(e, "stderr") and e.stderr:
error_details += f"\nSubprocess stderr:\n{e.stderr}"
raise RuntimeError(
f"Failed to quantize {input_gguf} to q2_k_l: {e}{error_details}"
)
output_path = Path(output_gguf)
if not output_path.exists():
raise RuntimeError(
f"Quantization failed - output file {output_gguf} not created"
)
if print_output:
file_size_bytes = output_path.stat().st_size
file_size_gb = file_size_bytes / (1024**3)
print(
f"Unsloth: Successfully quantized to {output_gguf} (size: {file_size_gb:.2f}GB)"
)
return str(output_gguf)
def check_if_sentencepiece_model(
model, temporary_location = "_unsloth_sentencepiece_temp"
):
@ -1305,7 +1389,16 @@ def save_to_gguf(
gguf_directory, f"{model_name}.{quant_method.upper()}.gguf"
)
try:
# Use the quantize_gguf function we created
if quant_method == "q2_k_l":
quantized_file = _quantize_q2_k_l(
input_gguf = base_gguf,
output_gguf = output_location,
quantizer_location = quantizer_location,
n_threads = n_cpus,
print_output = print_output,
)
else:
# Use unsloth-zoo's standard quantization for all other methods
quantized_file = quantize_gguf(
input_gguf = base_gguf,
output_gguf = output_location,
@ -1880,6 +1973,7 @@ def unsloth_save_pretrained_gguf(
"q4_k_m" : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K",
"q5_k_m" : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K",
"q2_k" : "Uses Q4_K for the attention.vw and feed_forward.w2 tensors, Q2_K for the other tensors.",
"q2_k_l" : "Q2_K_L with --output-tensor-type q8_0 --token-embedding-type q8_0.",
"q3_k_l" : "Uses Q5_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
"q3_k_m" : "Uses Q4_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
"q3_k_s" : "Uses Q3_K for all tensors",
@ -2203,6 +2297,7 @@ def unsloth_push_to_hub_gguf(
"q4_k_m" : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K",
"q5_k_m" : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K",
"q2_k" : "Uses Q4_K for the attention.vw and feed_forward.w2 tensors, Q2_K for the other tensors.",
"q2_k_l" : "Q2_K_L with --output-tensor-type q8_0 --token-embedding-type q8_0.",
"q3_k_l" : "Uses Q5_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
"q3_k_m" : "Uses Q4_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
"q3_k_s" : "Uses Q3_K for all tensors",