mirror of
https://github.com/unslothai/unsloth
synced 2026-04-21 13:37:39 +00:00
Add Qwen3.6 inference defaults for Studio (#5065)
* Add Qwen3.6 inference defaults for Studio Add qwen3.6 family entry to inference_defaults.json with the recommended sampling parameters from Qwen's documentation: temperature=0.7, top_p=0.8, top_k=20, min_p=0.0, presence_penalty=1.5, repetition_penalty=1.0. Without this, Qwen3.6 models fall through to the generic qwen3 pattern which uses different defaults (temperature=0.6, top_p=0.95, no presence_penalty). * Add Qwen3.6-35B-A3B-GGUF to default model lists * Add Qwen3.5/3.6 presence_penalty to thinking toggle and small-model disable logic - Thinking toggle (on-load + button click) now sets presencePenalty: 1.5 for Qwen3.5 and Qwen3.6 models (both thinking-ON and thinking-OFF states) - Small-model thinking-disable check (<9B defaults to no-thinking) extended from Qwen3.5-only to also cover Qwen3.6, in all 3 locations: frontend on-load, frontend refresh, backend llama_cpp.py
This commit is contained in:
parent
d56f980452
commit
0b57884120
5 changed files with 26 additions and 13 deletions
|
|
@ -1,6 +1,14 @@
|
|||
{
|
||||
"_comment": "Per-model-family inference parameter defaults. Sources: (1) Ollama params blobs, (2) Existing Unsloth Studio YAML configs. Patterns ordered longest-match-first.",
|
||||
"families": {
|
||||
"qwen3.6": {
|
||||
"temperature": 0.7,
|
||||
"top_p": 0.8,
|
||||
"top_k": 20,
|
||||
"min_p": 0.0,
|
||||
"repetition_penalty": 1.0,
|
||||
"presence_penalty": 1.5
|
||||
},
|
||||
"qwen3.5": {
|
||||
"temperature": 0.7,
|
||||
"top_p": 0.8,
|
||||
|
|
@ -369,7 +377,7 @@
|
|||
}
|
||||
},
|
||||
"patterns": [
|
||||
"qwen3.5",
|
||||
"qwen3.6", "qwen3.5",
|
||||
"qwen3-coder", "qwen3-next", "qwen3-vl", "qwen3",
|
||||
"qwen2.5-coder", "qwen2.5-vl", "qwen2.5-omni", "qwen2.5-math", "qwen2.5",
|
||||
"qwen2-vl", "qwen2",
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@ DEFAULT_MODELS_GGUF = [
|
|||
"unsloth/gemma-4-E4B-it-GGUF",
|
||||
"unsloth/gemma-4-31B-it-GGUF",
|
||||
"unsloth/gemma-4-26B-A4B-it-GGUF",
|
||||
"unsloth/Qwen3.6-35B-A3B-GGUF",
|
||||
"unsloth/Qwen3.5-4B-GGUF",
|
||||
"unsloth/Qwen3.5-9B-GGUF",
|
||||
"unsloth/Qwen3.5-35B-A3B-GGUF",
|
||||
|
|
@ -27,6 +28,7 @@ DEFAULT_MODELS_STANDARD = [
|
|||
"unsloth/gemma-4-E4B-it-GGUF",
|
||||
"unsloth/gemma-4-31B-it-GGUF",
|
||||
"unsloth/gemma-4-26B-A4B-it-GGUF",
|
||||
"unsloth/Qwen3.6-35B-A3B-GGUF",
|
||||
"unsloth/Qwen3.5-4B-GGUF",
|
||||
"unsloth/Qwen3.5-9B-GGUF",
|
||||
"unsloth/Qwen3.5-35B-A3B-GGUF",
|
||||
|
|
|
|||
|
|
@ -1514,12 +1514,12 @@ class LlamaCppBackend:
|
|||
)
|
||||
|
||||
# For reasoning models, set default thinking mode.
|
||||
# Qwen3.5 models below 9B (0.8B, 2B, 4B) disable thinking by default.
|
||||
# Qwen3.5/3.6 models below 9B (0.8B, 2B, 4B) disable thinking by default.
|
||||
# Only 9B and larger enable thinking.
|
||||
if self._supports_reasoning:
|
||||
thinking_default = True
|
||||
mid = (model_identifier or "").lower()
|
||||
if "qwen3.5" in mid:
|
||||
if "qwen3.5" in mid or "qwen3.6" in mid:
|
||||
size_val = _extract_model_size_b(mid)
|
||||
if size_val is not None and size_val < 9:
|
||||
thinking_default = False
|
||||
|
|
|
|||
|
|
@ -289,11 +289,11 @@ export function useChatModelRuntime() {
|
|||
loadedSpeculativeType: currentSpecType,
|
||||
});
|
||||
|
||||
// Set reasoning default for Qwen3.5 small models
|
||||
// Set reasoning default for Qwen3.5/3.6 small models
|
||||
if (supportsReasoning) {
|
||||
let reasoningDefault = true;
|
||||
const mid = statusRes.active_model.toLowerCase();
|
||||
if (mid.includes("qwen3.5")) {
|
||||
if (mid.includes("qwen3.5") || mid.includes("qwen3.6")) {
|
||||
const sizeMatch = mid.match(/(\d+\.?\d*)\s*b/);
|
||||
if (sizeMatch && parseFloat(sizeMatch[1]) < 9) {
|
||||
reasoningDefault = false;
|
||||
|
|
@ -462,11 +462,11 @@ export function useChatModelRuntime() {
|
|||
setParams(
|
||||
mergeRecommendedInference(currentParams, loadResponse, modelId),
|
||||
);
|
||||
// Qwen3.5 small models (0.8B, 2B, 4B, 9B) disable thinking by default
|
||||
// Qwen3.5/3.6 models below 9B (0.8B, 2B, 4B) disable thinking by default
|
||||
let reasoningDefault = loadResponse.supports_reasoning ?? false;
|
||||
if (reasoningDefault) {
|
||||
const mid = modelId.toLowerCase();
|
||||
if (mid.includes("qwen3.5")) {
|
||||
if (mid.includes("qwen3.5") || mid.includes("qwen3.6")) {
|
||||
const sizeMatch = mid.match(/(\d+\.?\d*)\s*b/);
|
||||
if (sizeMatch && parseFloat(sizeMatch[1]) < 9) {
|
||||
reasoningDefault = false;
|
||||
|
|
@ -509,12 +509,14 @@ export function useChatModelRuntime() {
|
|||
defaultChatTemplate: loadResponse.chat_template ?? null,
|
||||
chatTemplateOverride: null,
|
||||
});
|
||||
// Qwen3/3.5: apply thinking-mode-specific params after load
|
||||
// Qwen3/3.5/3.6: apply thinking-mode-specific params after load
|
||||
if (modelId.toLowerCase().includes("qwen3") && (loadResponse.supports_reasoning ?? false)) {
|
||||
const store = useChatRuntimeStore.getState();
|
||||
const mid = modelId.toLowerCase();
|
||||
const needsPresencePenalty = mid.includes("qwen3.5") || mid.includes("qwen3.6");
|
||||
const p = reasoningDefault
|
||||
? { temperature: 0.6, topP: 0.95, topK: 20, minP: 0.0 }
|
||||
: { temperature: 0.7, topP: 0.8, topK: 20, minP: 0.0 };
|
||||
? { temperature: 0.6, topP: 0.95, topK: 20, minP: 0.0, ...(needsPresencePenalty ? { presencePenalty: 1.5 } : {}) }
|
||||
: { temperature: 0.7, topP: 0.8, topK: 20, minP: 0.0, ...(needsPresencePenalty ? { presencePenalty: 1.5 } : {}) };
|
||||
store.setParams({ ...store.params, ...p });
|
||||
}
|
||||
await refresh();
|
||||
|
|
|
|||
|
|
@ -557,13 +557,14 @@ export function SharedComposer({
|
|||
if (reasoningAlwaysOn) return;
|
||||
const next = !reasoningEnabled;
|
||||
setReasoningEnabled(next);
|
||||
// Qwen3/3.5: adjust params for thinking on/off
|
||||
// Qwen3/3.5/3.6: adjust params for thinking on/off
|
||||
const store = useChatRuntimeStore.getState();
|
||||
const cp = store.params.checkpoint?.toLowerCase() ?? "";
|
||||
if (cp.includes("qwen3")) {
|
||||
const needsPresencePenalty = cp.includes("qwen3.5") || cp.includes("qwen3.6");
|
||||
const p = next
|
||||
? { temperature: 0.6, topP: 0.95, topK: 20, minP: 0.0 }
|
||||
: { temperature: 0.7, topP: 0.8, topK: 20, minP: 0.0 };
|
||||
? { temperature: 0.6, topP: 0.95, topK: 20, minP: 0.0, ...(needsPresencePenalty ? { presencePenalty: 1.5 } : {}) }
|
||||
: { temperature: 0.7, topP: 0.8, topK: 20, minP: 0.0, ...(needsPresencePenalty ? { presencePenalty: 1.5 } : {}) };
|
||||
store.setParams({ ...store.params, ...p });
|
||||
}
|
||||
}}
|
||||
|
|
|
|||
Loading…
Reference in a new issue