From e40dd92bdcca051696e70efa52661d4aeac2f1e1 Mon Sep 17 00:00:00 2001 From: Jordan-HS Date: Fri, 8 May 2026 21:23:04 +1000 Subject: [PATCH] fix(qwen35): support Qwen3.5:9B loading from Ollama GGUF MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Six fixes to load the Ollama-distributed Qwen3.5:9B Q4_K_M GGUF (sha256:dec52a44...). The model uses a hybrid SSM+attention architecture with per-layer n_head_kv variation, and bundles vision/MTP tensors that are not used for text inference. Changes: 1. llama-model-loader.cpp — get_key_or_arr: respect required=false When the array length mismatches n, previously always threw. Now returns false when required=false, matching the behavior of every other required-flag check in the function. 2. llama-model-loader.cpp — done_getting_tensors: warn on unclaimed tensors The Ollama GGUF bundles vision encoder (v.blk.*) and MTP decoder (mtp.*) tensors alongside the text model (blk.*). Loading text-only leaves these unclaimed. Changed hard error to warning when n_created < n_tensors. 3. llama-model.cpp — QWEN35/QWEN35MOE rope_sections: accept 3 or 4 elements The original code required exactly 4 rope sections. The Ollama GGUF has 3 (the 4th vision section is absent). Uses the fixed get_key_or_arr(required=false) to try 4 first, falling back to 3. 4. llama-model.cpp — QWEN35/QWEN35MOE load_tensors: per-layer KV dimensions n_head_kv is a per-layer array (0 for recurrent layers, 4 for attention layers). The outer-scope n_embd_k_gqa used layer 0's value (0), causing wrong tensor shapes. Fixed to use hparams.n_embd_k_gqa(i) and hparams.n_embd_v_gqa(i) per-layer. 5. llama-model.cpp — QWEN35/QWEN35MOE ssm_dt tensor name: remove bias suffix The GGUF key is blk.N.ssm_dt, not blk.N.ssm_dt.bias. 6. models/qwen35.cpp — build_layer_attn: per-layer n_head_kv for reshape The base class n_head_kv member reflects hparams.n_head_kv() which returns layer 0's value (0, recurrent). 
Fixed to call model.hparams.n_head_kv(il) for the per-layer correct head count before reshaping Kcur and Vcur. Tested: Qwen3.5:9B Q4_K_M, 64K context, q8_0-K/turbo4-V KV cache, RTX 3080 (sm_86, 10 GB). 91 tok/s decode, 2440 tok/s prompt. VRAM ~9749 MiB at 64K context, ~6227 MiB at idle-after-load. Co-Authored-By: Claude Sonnet 4.6 --- src/llama-model-loader.cpp | 10 ++++++++-- src/llama-model.cpp | 21 ++++++++++++++------- src/models/qwen35.cpp | 6 ++++-- 3 files changed, 26 insertions(+), 11 deletions(-) diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 1adff5810b4..1691af7084b 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -455,7 +455,10 @@ namespace GGUFMeta { GGUFMeta::GKV::get_kv(metadata, kid); if (n != arr_info.length) { - throw std::runtime_error(format("key %s has wrong array length; expected %u, got %u", key.c_str(), n, (uint32_t) arr_info.length)); + if (required) { + throw std::runtime_error(format("key %s has wrong array length; expected %u, got %u", key.c_str(), n, (uint32_t) arr_info.length)); + } + return false; } return get_arr(key, result, required); @@ -1317,9 +1320,12 @@ struct ggml_tensor * llama_model_loader::create_tensor_as_view(struct ggml_conte } void llama_model_loader::done_getting_tensors() const { - if (n_created != n_tensors) { + if (n_created > n_tensors) { throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created)); } + if (n_created < n_tensors) { + LLAMA_LOG_WARN("%s: %d tensors in file not loaded (multimodal vision/MTP tensors skipped for text-only inference)\n", __func__, n_tensors - n_created); + } if (n_tensors_moved > 0) { LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %zu others) cannot be used with preferred buffer type %s, using %s instead\n", __func__, first_tensor_moved_name.c_str(), first_tensor_moved_type_name.c_str(), n_tensors_moved - 1, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 
f77b2e9217f..793f18da520 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2788,7 +2788,11 @@ void llama_model::load_hparams(llama_model_loader & ml) { case LLM_ARCH_QWEN35: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true); + // Accept 3 or 4 rope sections (Ollama qwen3.5:9b GGUF has 3; some variants have 4). + // get_key_or_arr with required=false returns false on length mismatch instead of throwing. + if (!ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false)) { + ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 3, true); + } // Load linear attention (gated delta net) parameters ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv); @@ -2818,8 +2822,11 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - - ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true); + // Accept 3 or 4 rope sections (Ollama qwen3.5:9b GGUF has 3; some variants have 4). + // get_key_or_arr with required=false returns false on length mismatch instead of throwing. 
+ if (!ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false)) { + ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 3, true); + } // Load linear attention (gated delta net) parameters ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv); @@ -7548,7 +7555,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { if (!hparams.is_recurrent(i)) { // Attention layers - create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, 0); + create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head * 2, hparams.n_embd_k_gqa(i), hparams.n_embd_v_gqa(i), 0); layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0); // Q/K normalization for attention layers @@ -7560,7 +7567,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED); layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED); layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0); - layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0); + layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, i), { hparams.ssm_dt_rank }, 0); layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0); layer.ssm_beta = create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", i), { n_embd, n_v_heads }, 0); layer.ssm_alpha = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "weight", i), { n_embd, n_v_heads }, 0); @@ -7611,7 +7618,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { if (!hparams.is_recurrent(i)) { // Attention layers - create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, 0); + create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head * 2, 
hparams.n_embd_k_gqa(i), hparams.n_embd_v_gqa(i), 0); layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0); // Q/K normalization for attention layers @@ -7623,7 +7630,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED); layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED); layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0); - layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0); + layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, i), { hparams.ssm_dt_rank }, 0); layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0); layer.ssm_beta = create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", i), { n_embd, n_v_heads }, 0); layer.ssm_alpha = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "weight", i), { n_embd, n_v_heads }, 0); diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp index 87790f08e4e..0f3b149f501 100644 --- a/src/models/qwen35.cpp +++ b/src/models/qwen35.cpp @@ -144,8 +144,10 @@ ggml_tensor * llm_build_qwen35::build_layer_attn( ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s); cb(Vcur, "Vcur", il); + const int64_t n_head_kv_il = model.hparams.n_head_kv(il); + // Apply K normalization - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv_il, n_tokens); Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il); cb(Kcur, "Kcur_normed", il); @@ -156,7 +158,7 @@ ggml_tensor * llm_build_qwen35::build_layer_attn( gate = ggml_cont_2d(ctx0, gate, n_embd_head * n_head, n_tokens); cb(gate, "gate_reshaped", il); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, 
n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv_il, n_tokens); // Apply MRoPE Qcur = ggml_rope_multi(