mirror of
https://github.com/unslothai/unsloth
synced 2026-04-21 13:37:39 +00:00
Merge branch 'main' into pre-commit-ci-update-config
This commit is contained in:
commit
4493e84274
3 changed files with 138 additions and 19 deletions
|
|
@ -272,12 +272,13 @@ export function ExportDialog({
|
|||
exportSuccess,
|
||||
exportOutputPath,
|
||||
}: ExportDialogProps) {
|
||||
// Live log capture is only meaningful for export methods that run
|
||||
// a slow subprocess operation with interesting stdout: merged and
|
||||
// gguf. LoRA adapter export is a fast disk write and would just
|
||||
// show a blank panel, so we hide it there.
|
||||
// Live log capture is useful for any export path executed by the
|
||||
// backend worker, including LoRA adapter-only export.
|
||||
const showLogPanel =
|
||||
exportMethod === "merged" || exportMethod === "gguf";
|
||||
exportMethod === "merged" ||
|
||||
exportMethod === "gguf" ||
|
||||
exportMethod === "lora";
|
||||
const showCompletionScreen = exportSuccess && !showLogPanel;
|
||||
|
||||
const { lines: logLines, connected: logConnected, error: logError } =
|
||||
useExportLogs(exporting && showLogPanel, exportMethod, open);
|
||||
|
|
@ -314,7 +315,7 @@ export function ExportDialog({
|
|||
className={showLogPanel ? "sm:max-w-2xl" : "sm:max-w-lg"}
|
||||
onInteractOutside={(e) => { if (exporting) e.preventDefault(); }}
|
||||
>
|
||||
{exportSuccess ? (
|
||||
{showCompletionScreen ? (
|
||||
<>
|
||||
<div className="flex flex-col items-center gap-3 py-6">
|
||||
<div className="flex size-12 items-center justify-center rounded-full bg-emerald-500/10">
|
||||
|
|
@ -460,6 +461,27 @@ export function ExportDialog({
|
|||
)}
|
||||
</AnimatePresence>
|
||||
|
||||
{/* Success banner for log-driven exports.
|
||||
Keep users on the log screen after completion so they can
|
||||
inspect conversion output before closing. */}
|
||||
{exportSuccess && showLogPanel && (
|
||||
<div className="flex items-start gap-2 rounded-lg bg-emerald-500/10 p-3 text-sm text-emerald-700 dark:text-emerald-300">
|
||||
<HugeiconsIcon icon={CheckmarkCircle02Icon} className="mt-0.5 size-4 shrink-0" />
|
||||
<div className="flex min-w-0 flex-col gap-1">
|
||||
<span>
|
||||
{destination === "hub"
|
||||
? "Export finished and pushed to Hugging Face Hub."
|
||||
: "Export finished successfully."}
|
||||
</span>
|
||||
{exportOutputPath ? (
|
||||
<code className="select-all break-all font-mono text-[12px] text-foreground/90" title={exportOutputPath}>
|
||||
{exportOutputPath}
|
||||
</code>
|
||||
) : null}
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Error banner */}
|
||||
{exportError && (
|
||||
<div className="flex items-start gap-2 rounded-lg bg-destructive/10 p-3 text-sm text-destructive">
|
||||
|
|
@ -577,14 +599,16 @@ export function ExportDialog({
|
|||
onClick={() => onOpenChange(false)}
|
||||
disabled={exporting}
|
||||
>
|
||||
Cancel
|
||||
{exportSuccess ? "Done" : "Cancel"}
|
||||
</Button>
|
||||
<Button onClick={onExport} disabled={exporting}>
|
||||
<Button onClick={onExport} disabled={exporting || exportSuccess}>
|
||||
{exporting ? (
|
||||
<span className="flex items-center gap-2">
|
||||
<Spinner className="size-4" />
|
||||
Exporting…
|
||||
</span>
|
||||
) : exportSuccess ? (
|
||||
"Export Complete"
|
||||
) : (
|
||||
"Start Export"
|
||||
)}
|
||||
|
|
|
|||
|
|
@ -36,14 +36,14 @@ export const EXPORT_METHODS: {
|
|||
];
|
||||
|
||||
// Table of quantization options. Each row carries a `value` identifier, a
// `label` string and an approximate `size` string; the q4_k_m row is the only
// one flagged `recommended: true`.
// NOTE(review): the model that the size estimates refer to is not visible in
// this view — confirm against getEstimatedSize() and its callers.
export const QUANT_OPTIONS = [
|
||||
{ value: "q2_k_l", label: "Q2_K_L", size: "~2.9 GB" },
|
||||
{ value: "q3_k_m", label: "Q3_K_M", size: "~3.5 GB" },
|
||||
{ value: "q4_0", label: "Q4_0", size: "~4.1 GB" },
|
||||
{ value: "q4_k_m", label: "Q4_K_M", size: "~4.8 GB", recommended: true },
|
||||
{ value: "q5_0", label: "Q5_0", size: "~5.0 GB" },
|
||||
{ value: "q5_k_m", label: "Q5_K_M", size: "~5.6 GB" },
|
||||
{ value: "q6_k", label: "Q6_K", size: "~6.6 GB" },
|
||||
{ value: "q8_0", label: "Q8_0", size: "~8.2 GB" },
|
||||
{ value: "bf16", label: "BF16", size: "~14.2 GB" },
|
||||
{ value: "f16", label: "F16", size: "~14.2 GB" },
|
||||
{ value: "f32", label: "F32", size: "~28.4 GB" },
|
||||
];
|
||||
|
||||
export function getEstimatedSize(
|
||||
|
|
|
|||
111
unsloth/save.py
111
unsloth/save.py
|
|
@ -122,6 +122,7 @@ ALLOWED_QUANTS = {
|
|||
"q4_k_m": "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K",
|
||||
"q5_k_m": "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K",
|
||||
"q2_k": "Uses Q4_K for the attention.vw and feed_forward.w2 tensors, Q2_K for the other tensors.",
|
||||
"q2_k_l": "Q2_K_L with q8_0 output/token embeddings for higher quality than plain Q2_K.",
|
||||
"q3_k_l": "Uses Q5_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
|
||||
"q3_k_m": "Uses Q4_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
|
||||
"q3_k_s": "Uses Q3_K for all tensors",
|
||||
|
|
@ -153,6 +154,89 @@ def print_quantization_methods():
|
|||
print(f'"{key}" ==> {value}')
|
||||
|
||||
|
||||
def _quantize_q2_k_l(
    input_gguf: Union[str, os.PathLike],
    output_gguf: Union[str, os.PathLike],
    quantizer_location: Union[str, os.PathLike],
    n_threads: int,
    print_output: bool = True,
) -> str:
    """Quantize a GGUF file using the "Q2_K_L" preset.

    "Q2_K_L" is an Unsloth-side preset, not a native llama.cpp ftype. It
    maps to the `q2_k` ftype with `--output-tensor-type q8_0` and
    `--token-embedding-type q8_0` so the output/embedding tensors retain
    higher precision than a plain Q2_K quant.

    Args:
        input_gguf: Path of the GGUF file handed to the quantizer as input.
        output_gguf: Path the quantizer is asked to write the result to.
        quantizer_location: Executable to run (first element of the command).
        n_threads: Thread count, passed as the last command-line argument.
        print_output: When true, stream the quantizer's combined
            stdout/stderr line by line and print progress messages; when
            false, capture the output silently and only surface it on failure.

    Returns:
        `str(output_gguf)`.

    Raises:
        RuntimeError: If the quantizer exits with a non-zero status, or if it
            exits successfully but `output_gguf` does not exist afterwards.
    """
    command = [
        str(quantizer_location),
        "--output-tensor-type",
        "q8_0",
        "--token-embedding-type",
        "q8_0",
        str(input_gguf),
        str(output_gguf),
        "q2_k",
        str(n_threads),
    ]

    if print_output:
        print(
            "Unsloth: Quantizing as Q2_K_L preset "
            "(q2_k + --output-tensor-type q8_0 --token-embedding-type q8_0)..."
        )
        # Stream the merged stdout/stderr so long conversions show live progress.
        with subprocess.Popen(
            command,
            shell = False,
            text = True,
            stdout = subprocess.PIPE,
            stderr = subprocess.STDOUT,
            bufsize = 1,
        ) as sp:
            # Always set because stdout = PIPE; kept to narrow the Optional type.
            assert sp.stdout is not None
            for line in sp.stdout:
                print(line, end = "", flush = True)
            returncode = sp.wait()

        if returncode != 0:
            # The output was already echoed above, so it is not repeated here.
            raise RuntimeError(
                f"Failed to quantize {input_gguf} to q2_k_l: process exited with code {returncode}"
            )
    else:
        try:
            subprocess.run(
                command,
                shell = False,
                check = True,
                capture_output = True,
                text = True,
            )
        except subprocess.CalledProcessError as e:
            # Nothing was shown to the user, so attach the captured streams.
            error_details = ""
            if e.stdout:
                error_details += f"\nSubprocess stdout:\n{e.stdout}"
            if e.stderr:
                error_details += f"\nSubprocess stderr:\n{e.stderr}"
            raise RuntimeError(
                f"Failed to quantize {input_gguf} to q2_k_l: {e}{error_details}"
            ) from e

    # A zero exit status alone does not prove the file was written.
    output_path = Path(output_gguf)
    if not output_path.exists():
        raise RuntimeError(
            f"Quantization failed - output file {output_gguf} not created"
        )

    if print_output:
        file_size_bytes = output_path.stat().st_size
        file_size_gb = file_size_bytes / (1024**3)
        print(
            f"Unsloth: Successfully quantized to {output_gguf} (size: {file_size_gb:.2f}GB)"
        )
    return str(output_gguf)
|
||||
|
||||
|
||||
def check_if_sentencepiece_model(
|
||||
model, temporary_location = "_unsloth_sentencepiece_temp"
|
||||
):
|
||||
|
|
@ -1305,14 +1389,23 @@ def save_to_gguf(
|
|||
gguf_directory, f"{model_name}.{quant_method.upper()}.gguf"
|
||||
)
|
||||
try:
|
||||
# Use the quantize_gguf function we created
|
||||
quantized_file = quantize_gguf(
|
||||
input_gguf = base_gguf,
|
||||
output_gguf = output_location,
|
||||
quant_type = quant_method,
|
||||
quantizer_location = quantizer_location,
|
||||
print_output = print_output,
|
||||
)
|
||||
if quant_method == "q2_k_l":
|
||||
quantized_file = _quantize_q2_k_l(
|
||||
input_gguf = base_gguf,
|
||||
output_gguf = output_location,
|
||||
quantizer_location = quantizer_location,
|
||||
n_threads = n_cpus,
|
||||
print_output = print_output,
|
||||
)
|
||||
else:
|
||||
# Use unsloth-zoo's standard quantization for all other methods
|
||||
quantized_file = quantize_gguf(
|
||||
input_gguf = base_gguf,
|
||||
output_gguf = output_location,
|
||||
quant_type = quant_method,
|
||||
quantizer_location = quantizer_location,
|
||||
print_output = print_output,
|
||||
)
|
||||
all_saved_locations.append(quantized_file)
|
||||
quants_created = True
|
||||
except Exception as e:
|
||||
|
|
@ -1880,6 +1973,7 @@ def unsloth_save_pretrained_gguf(
|
|||
"q4_k_m" : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K",
|
||||
"q5_k_m" : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K",
|
||||
"q2_k" : "Uses Q4_K for the attention.vw and feed_forward.w2 tensors, Q2_K for the other tensors.",
|
||||
"q2_k_l" : "Q2_K_L with --output-tensor-type q8_0 --token-embedding-type q8_0.",
|
||||
"q3_k_l" : "Uses Q5_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
|
||||
"q3_k_m" : "Uses Q4_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
|
||||
"q3_k_s" : "Uses Q3_K for all tensors",
|
||||
|
|
@ -2203,6 +2297,7 @@ def unsloth_push_to_hub_gguf(
|
|||
"q4_k_m" : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K",
|
||||
"q5_k_m" : "Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K",
|
||||
"q2_k" : "Uses Q4_K for the attention.vw and feed_forward.w2 tensors, Q2_K for the other tensors.",
|
||||
"q2_k_l" : "Q2_K_L with --output-tensor-type q8_0 --token-embedding-type q8_0.",
|
||||
"q3_k_l" : "Uses Q5_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
|
||||
"q3_k_m" : "Uses Q4_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K",
|
||||
"q3_k_s" : "Uses Q3_K for all tensors",
|
||||
|
|
|
|||
Loading…
Reference in a new issue