diff --git a/backend/cpp/ik-llama-cpp/patches/0002-gemma3-default-rms-norm-eps.patch b/backend/cpp/ik-llama-cpp/patches/0002-gemma3-default-rms-norm-eps.patch new file mode 100644 index 000000000..f62dad5aa --- /dev/null +++ b/backend/cpp/ik-llama-cpp/patches/0002-gemma3-default-rms-norm-eps.patch @@ -0,0 +1,38 @@ +From: LocalAI maintainers +Subject: [PATCH] gemma3: default rms norm eps when GGUF metadata key is missing + +Some Gemma 3 GGUF files (notably those distributed via the Ollama +registry) do not embed the `gemma3.attention.layer_norm_rms_epsilon` +metadata key. ik_llama.cpp currently requires the key to be present and +fails the entire model load with: + + error loading model hyperparameters: + key not found in model: gemma3.attention.layer_norm_rms_epsilon + +Ollama's own loader silently falls back to ~1e-6 in the same situation, +which is the canonical Gemma 3 default (see google/gemma_pytorch +config.py and the Hugging Face Gemma3Config), so the model still loads +and works correctly. + +Mirror that behavior here: pre-seed the field with the Gemma 3 default +and mark the metadata key as optional. This unblocks Ollama-converted +Gemma 3 models without affecting GGUFs that already carry the key. + +Refs: ggml-org/llama.cpp#12367, ollama/ollama#10262, mudler/LocalAI#9414 +--- + src/llama-hparams.cpp | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp +--- a/src/llama-hparams.cpp ++++ b/src/llama-hparams.cpp +@@ -679,7 +679,8 @@ + hparams.rope_freq_scale_train_swa = 1.0f; + + ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); +- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ++ hparams.f_norm_rms_eps = 1e-6f; // Gemma 3 canonical default; some Ollama GGUFs omit the key ++ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false); + + switch (hparams.n_layer) { + case 26: model.type = e_model::MODEL_2B; break; diff --git a/backend/cpp/llama-cpp/patches/0001-gemma3-default-rms-norm-eps.patch b/backend/cpp/llama-cpp/patches/0001-gemma3-default-rms-norm-eps.patch new file mode 100644 index 000000000..b98f1183a --- /dev/null +++ b/backend/cpp/llama-cpp/patches/0001-gemma3-default-rms-norm-eps.patch @@ -0,0 +1,38 @@ +From: LocalAI maintainers +Subject: [PATCH] gemma3: default rms norm eps when GGUF metadata key is missing + +Some Gemma 3 GGUF files (notably those distributed via the Ollama +registry) do not embed the `gemma3.attention.layer_norm_rms_epsilon` +metadata key. llama.cpp currently requires the key to be present and +fails the entire model load with: + + error loading model hyperparameters: + key not found in model: gemma3.attention.layer_norm_rms_epsilon + +Ollama's own loader silently falls back to ~1e-6 in the same situation, +which is the canonical Gemma 3 default (see google/gemma_pytorch +config.py and the Hugging Face Gemma3Config), so the model still loads +and works correctly. + +Mirror that behavior here: pre-seed the field with the Gemma 3 default +and mark the metadata key as optional. This unblocks Ollama-converted +Gemma 3 models without affecting GGUFs that already carry the key. + +Refs: ggml-org/llama.cpp#12367, ollama/ollama#10262, mudler/LocalAI#9414 +--- + src/llama-model.cpp | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/src/llama-model.cpp b/src/llama-model.cpp +--- a/src/llama-model.cpp ++++ b/src/llama-model.cpp +@@ -1568,7 +1568,8 @@ + + hparams.f_final_logit_softcapping = 0.0f; + ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false); +- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ++ hparams.f_norm_rms_eps = 1e-6f; // Gemma 3 canonical default; some Ollama GGUFs omit the key ++ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false); + + switch (hparams.n_layer) { + case 18: type = LLM_TYPE_270M; break;