From 0b57884120f68a9765dba63b28f9f2d85df6ad9b Mon Sep 17 00:00:00 2001
From: Daniel Han
Date: Thu, 16 Apr 2026 11:42:42 -0700
Subject: [PATCH] Add Qwen3.6 inference defaults for Studio (#5065)

* Add Qwen3.6 inference defaults for Studio

Add a qwen3.6 family entry to inference_defaults.json with the recommended
sampling parameters from Qwen's documentation: temperature=0.7, top_p=0.8,
top_k=20, min_p=0.0, presence_penalty=1.5, repetition_penalty=1.0. Without
this, Qwen3.6 models fall through to the generic qwen3 pattern, which uses
different defaults (temperature=0.6, top_p=0.95, no presence_penalty).

* Add Qwen3.6-35B-A3B-GGUF to default model lists

* Add Qwen3.5/3.6 presence_penalty to thinking toggle and small-model
disable logic

- The thinking toggle (on-load + button click) now sets presencePenalty: 1.5
  for Qwen3.5 and Qwen3.6 models, in both the thinking-ON and thinking-OFF
  states
- The small-model thinking-disable check (<9B defaults to no-thinking) is
  extended from Qwen3.5-only to also cover Qwen3.6, in all 3 locations:
  frontend on-load, frontend refresh, backend llama_cpp.py
---
 .../assets/configs/inference_defaults.json     | 10 +++++++++-
 studio/backend/core/inference/defaults.py      |  2 ++
 studio/backend/core/inference/llama_cpp.py     |  4 ++--
 .../chat/hooks/use-chat-model-runtime.ts       | 16 +++++++++-------
 .../src/features/chat/shared-composer.tsx      |  7 ++++---
 5 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/studio/backend/assets/configs/inference_defaults.json b/studio/backend/assets/configs/inference_defaults.json
index 1b4b5381e..1b10b557e 100644
--- a/studio/backend/assets/configs/inference_defaults.json
+++ b/studio/backend/assets/configs/inference_defaults.json
@@ -1,6 +1,14 @@
 {
   "_comment": "Per-model-family inference parameter defaults. Sources: (1) Ollama params blobs, (2) Existing Unsloth Studio YAML configs. Patterns ordered longest-match-first.",
   "families": {
+    "qwen3.6": {
+      "temperature": 0.7,
+      "top_p": 0.8,
+      "top_k": 20,
+      "min_p": 0.0,
+      "repetition_penalty": 1.0,
+      "presence_penalty": 1.5
+    },
     "qwen3.5": {
       "temperature": 0.7,
       "top_p": 0.8,
@@ -369,7 +377,7 @@
     }
   },
   "patterns": [
-    "qwen3.5",
+    "qwen3.6", "qwen3.5",
     "qwen3-coder", "qwen3-next", "qwen3-vl", "qwen3",
     "qwen2.5-coder", "qwen2.5-vl", "qwen2.5-omni", "qwen2.5-math", "qwen2.5",
     "qwen2-vl", "qwen2",
diff --git a/studio/backend/core/inference/defaults.py b/studio/backend/core/inference/defaults.py
index f3026ddda..53718c129 100644
--- a/studio/backend/core/inference/defaults.py
+++ b/studio/backend/core/inference/defaults.py
@@ -10,6 +10,7 @@ DEFAULT_MODELS_GGUF = [
     "unsloth/gemma-4-E4B-it-GGUF",
     "unsloth/gemma-4-31B-it-GGUF",
     "unsloth/gemma-4-26B-A4B-it-GGUF",
+    "unsloth/Qwen3.6-35B-A3B-GGUF",
     "unsloth/Qwen3.5-4B-GGUF",
     "unsloth/Qwen3.5-9B-GGUF",
     "unsloth/Qwen3.5-35B-A3B-GGUF",
@@ -27,6 +28,7 @@ DEFAULT_MODELS_STANDARD = [
     "unsloth/gemma-4-E4B-it-GGUF",
     "unsloth/gemma-4-31B-it-GGUF",
     "unsloth/gemma-4-26B-A4B-it-GGUF",
+    "unsloth/Qwen3.6-35B-A3B-GGUF",
     "unsloth/Qwen3.5-4B-GGUF",
     "unsloth/Qwen3.5-9B-GGUF",
     "unsloth/Qwen3.5-35B-A3B-GGUF",
diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
index 77b58e22f..2e2699530 100644
--- a/studio/backend/core/inference/llama_cpp.py
+++ b/studio/backend/core/inference/llama_cpp.py
@@ -1514,12 +1514,12 @@ class LlamaCppBackend:
         )

         # For reasoning models, set default thinking mode.
-        # Qwen3.5 models below 9B (0.8B, 2B, 4B) disable thinking by default.
+        # Qwen3.5/3.6 models below 9B (0.8B, 2B, 4B) disable thinking by default.
         # Only 9B and larger enable thinking.
         if self._supports_reasoning:
             thinking_default = True
             mid = (model_identifier or "").lower()
-            if "qwen3.5" in mid:
+            if "qwen3.5" in mid or "qwen3.6" in mid:
                 size_val = _extract_model_size_b(mid)
                 if size_val is not None and size_val < 9:
                     thinking_default = False
diff --git a/studio/frontend/src/features/chat/hooks/use-chat-model-runtime.ts b/studio/frontend/src/features/chat/hooks/use-chat-model-runtime.ts
index 037d3182a..6c870d74a 100644
--- a/studio/frontend/src/features/chat/hooks/use-chat-model-runtime.ts
+++ b/studio/frontend/src/features/chat/hooks/use-chat-model-runtime.ts
@@ -289,11 +289,11 @@ export function useChatModelRuntime() {
        loadedSpeculativeType: currentSpecType,
      });

-      // Set reasoning default for Qwen3.5 small models
+      // Set reasoning default for Qwen3.5/3.6 small models
      if (supportsReasoning) {
        let reasoningDefault = true;
        const mid = statusRes.active_model.toLowerCase();
-        if (mid.includes("qwen3.5")) {
+        if (mid.includes("qwen3.5") || mid.includes("qwen3.6")) {
          const sizeMatch = mid.match(/(\d+\.?\d*)\s*b/);
          if (sizeMatch && parseFloat(sizeMatch[1]) < 9) {
            reasoningDefault = false;
@@ -462,11 +462,11 @@ export function useChatModelRuntime() {
      setParams(
        mergeRecommendedInference(currentParams, loadResponse, modelId),
      );
-      // Qwen3.5 small models (0.8B, 2B, 4B, 9B) disable thinking by default
+      // Qwen3.5/3.6 models below 9B (0.8B, 2B, 4B) disable thinking by default
      let reasoningDefault = loadResponse.supports_reasoning ?? false;
      if (reasoningDefault) {
        const mid = modelId.toLowerCase();
-        if (mid.includes("qwen3.5")) {
+        if (mid.includes("qwen3.5") || mid.includes("qwen3.6")) {
          const sizeMatch = mid.match(/(\d+\.?\d*)\s*b/);
          if (sizeMatch && parseFloat(sizeMatch[1]) < 9) {
            reasoningDefault = false;
@@ -509,12 +509,14 @@ export function useChatModelRuntime() {
        defaultChatTemplate: loadResponse.chat_template ?? null,
        chatTemplateOverride: null,
      });
-      // Qwen3/3.5: apply thinking-mode-specific params after load
+      // Qwen3/3.5/3.6: apply thinking-mode-specific params after load
      if (modelId.toLowerCase().includes("qwen3") && (loadResponse.supports_reasoning ?? false)) {
        const store = useChatRuntimeStore.getState();
+        const mid = modelId.toLowerCase();
+        const needsPresencePenalty = mid.includes("qwen3.5") || mid.includes("qwen3.6");
        const p = reasoningDefault
-          ? { temperature: 0.6, topP: 0.95, topK: 20, minP: 0.0 }
-          : { temperature: 0.7, topP: 0.8, topK: 20, minP: 0.0 };
+          ? { temperature: 0.6, topP: 0.95, topK: 20, minP: 0.0, ...(needsPresencePenalty ? { presencePenalty: 1.5 } : {}) }
+          : { temperature: 0.7, topP: 0.8, topK: 20, minP: 0.0, ...(needsPresencePenalty ? { presencePenalty: 1.5 } : {}) };
        store.setParams({ ...store.params, ...p });
      }
      await refresh();
diff --git a/studio/frontend/src/features/chat/shared-composer.tsx b/studio/frontend/src/features/chat/shared-composer.tsx
index f82355526..1c69ade6b 100644
--- a/studio/frontend/src/features/chat/shared-composer.tsx
+++ b/studio/frontend/src/features/chat/shared-composer.tsx
@@ -557,13 +557,14 @@ export function SharedComposer({
        if (reasoningAlwaysOn) return;
        const next = !reasoningEnabled;
        setReasoningEnabled(next);
-        // Qwen3/3.5: adjust params for thinking on/off
+        // Qwen3/3.5/3.6: adjust params for thinking on/off
        const store = useChatRuntimeStore.getState();
        const cp = store.params.checkpoint?.toLowerCase() ?? "";
        if (cp.includes("qwen3")) {
+          const needsPresencePenalty = cp.includes("qwen3.5") || cp.includes("qwen3.6");
          const p = next
-            ? { temperature: 0.6, topP: 0.95, topK: 20, minP: 0.0 }
-            : { temperature: 0.7, topP: 0.8, topK: 20, minP: 0.0 };
+            ? { temperature: 0.6, topP: 0.95, topK: 20, minP: 0.0, ...(needsPresencePenalty ? { presencePenalty: 1.5 } : {}) }
+            : { temperature: 0.7, topP: 0.8, topK: 20, minP: 0.0, ...(needsPresencePenalty ? { presencePenalty: 1.5 } : {}) };
          store.setParams({ ...store.params, ...p });
        }
      }}
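Reviewer note, not part of the patch: a minimal sketch of how the two frontend
checks compose. The helper names below are invented for illustration; the real
logic lives inline in use-chat-model-runtime.ts and shared-composer.tsx exactly
as diffed above.

// Size gate: same regex as the diff. "unsloth/Qwen3.6-35B-A3B-GGUF"
// matches "35b" -> 35 >= 9, so thinking stays on by default;
// a hypothetical "Qwen3.6-4B" would match "4b" -> 4 < 9 -> thinking off.
function thinkingDefault(modelId: string, supportsReasoning: boolean): boolean {
  if (!supportsReasoning) return false;
  const mid = modelId.toLowerCase();
  if (mid.includes("qwen3.5") || mid.includes("qwen3.6")) {
    const sizeMatch = mid.match(/(\d+\.?\d*)\s*b/);
    if (sizeMatch && parseFloat(sizeMatch[1]) < 9) return false;
  }
  return true;
}

// Param selection: presencePenalty: 1.5 is spread in for Qwen3.5/3.6 only,
// and in both the thinking-ON and thinking-OFF branches, matching the diff.
function qwen3Params(modelId: string, thinking: boolean) {
  const mid = modelId.toLowerCase();
  const extra =
    mid.includes("qwen3.5") || mid.includes("qwen3.6")
      ? { presencePenalty: 1.5 }
      : {};
  return thinking
    ? { temperature: 0.6, topP: 0.95, topK: 20, minP: 0.0, ...extra }
    : { temperature: 0.7, topP: 0.8, topK: 20, minP: 0.0, ...extra };
}

thinkingDefault("unsloth/Qwen3.6-35B-A3B-GGUF", true); // true (35B >= 9B)
qwen3Params("unsloth/Qwen3.6-35B-A3B-GGUF", true);
// -> { temperature: 0.6, topP: 0.95, topK: 20, minP: 0.0, presencePenalty: 1.5 }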
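The JSON side of the fix depends on pattern order. The resolver itself is not
in this diff; the sketch below assumes a simple first-substring-match over the
patterns array, consistent with the file's "Patterns ordered longest-match-first"
comment. That assumption is exactly why "qwen3.6" must appear before the generic
"qwen3" entry. The "Qwen3-8B-GGUF" id is an illustrative example.

// Assumed resolution: the first pattern that is a substring of the model id wins.
const patterns = ["qwen3.6", "qwen3.5", "qwen3-coder", "qwen3-next", "qwen3-vl", "qwen3"];

function resolveFamily(modelId: string): string | undefined {
  const mid = modelId.toLowerCase();
  return patterns.find((p) => mid.includes(p));
}

resolveFamily("unsloth/Qwen3.6-35B-A3B-GGUF"); // "qwen3.6" (fell through to "qwen3" before this patch)
resolveFamily("unsloth/Qwen3-8B-GGUF");        // "qwen3", unaffected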