diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 1adff5810b4..1691af7084b 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -455,7 +455,10 @@ namespace GGUFMeta {
                 GGUFMeta::GKV::get_kv(metadata, kid);
 
             if (n != arr_info.length) {
-                throw std::runtime_error(format("key %s has wrong array length; expected %u, got %u", key.c_str(), n, (uint32_t) arr_info.length));
+                if (required) {
+                    throw std::runtime_error(format("key %s has wrong array length; expected %u, got %u", key.c_str(), n, (uint32_t) arr_info.length));
+                }
+                return false;
             }
 
             return get_arr(key, result, required);
@@ -1317,9 +1320,12 @@ struct ggml_tensor * llama_model_loader::create_tensor_as_view(struct ggml_conte
 }
 
 void llama_model_loader::done_getting_tensors() const {
-    if (n_created != n_tensors) {
+    if (n_created > n_tensors) {
         throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
     }
+    if (n_created < n_tensors) {
+        LLAMA_LOG_WARN("%s: %d tensors in file not loaded (multimodal vision/MTP tensors skipped for text-only inference)\n", __func__, n_tensors - n_created);
+    }
 
     if (n_tensors_moved > 0) {
         LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %zu others) cannot be used with preferred buffer type %s, using %s instead\n", __func__, first_tensor_moved_name.c_str(), first_tensor_moved_type_name.c_str(), n_tensors_moved - 1,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index f77b2e9217f..793f18da520 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -2788,7 +2788,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         case LLM_ARCH_QWEN35:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
+                // Accept 3 or 4 rope sections (Ollama qwen3.5:9b GGUF has 3; some variants have 4).
+                // get_key_or_arr with required=false returns false on length mismatch instead of throwing.
+                if (!ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false)) {
+                    ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 3, true);
+                }
 
                 // Load linear attention (gated delta net) parameters
                 ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
@@ -2818,8 +2822,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
                 ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-
-                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
+                // Accept 3 or 4 rope sections (Ollama qwen3.5:9b GGUF has 3; some variants have 4).
+                // get_key_or_arr with required=false returns false on length mismatch instead of throwing.
+                if (!ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false)) {
+                    ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 3, true);
+                }
 
                 // Load linear attention (gated delta net) parameters
                 ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
@@ -7548,7 +7555,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
                     if (!hparams.is_recurrent(i)) {
                         // Attention layers
-                        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, 0);
+                        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head * 2, hparams.n_embd_k_gqa(i), hparams.n_embd_v_gqa(i), 0);
                         layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
 
                         // Q/K normalization for attention layers
@@ -7560,7 +7567,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED);
                         layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED);
                         layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
-                        layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
+                        layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, i), { hparams.ssm_dt_rank }, 0);
                         layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0);
                         layer.ssm_beta = create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", i), { n_embd, n_v_heads }, 0);
                         layer.ssm_alpha = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "weight", i), { n_embd, n_v_heads }, 0);
@@ -7611,7 +7618,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
                     if (!hparams.is_recurrent(i)) {
                         // Attention layers
-                        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, 0);
+                        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head * 2, hparams.n_embd_k_gqa(i), hparams.n_embd_v_gqa(i), 0);
                         layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
 
                         // Q/K normalization for attention layers
@@ -7623,7 +7630,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED);
                         layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED);
                         layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
-                        layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
+                        layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, i), { hparams.ssm_dt_rank }, 0);
                         layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0);
                         layer.ssm_beta = create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", i), { n_embd, n_v_heads }, 0);
                         layer.ssm_alpha = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "weight", i), { n_embd, n_v_heads }, 0);
diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp
index 87790f08e4e..0f3b149f501 100644
--- a/src/models/qwen35.cpp
+++ b/src/models/qwen35.cpp
@@ -144,8 +144,10 @@ ggml_tensor * llm_build_qwen35::build_layer_attn(
     ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s);
     cb(Vcur, "Vcur", il);
 
+    const int64_t n_head_kv_il = model.hparams.n_head_kv(il);
+
     // Apply K normalization
-    Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+    Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv_il, n_tokens);
     Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il);
     cb(Kcur, "Kcur_normed", il);
 
@@ -156,7 +158,7 @@ ggml_tensor * llm_build_qwen35::build_layer_attn(
     gate = ggml_cont_2d(ctx0, gate, n_embd_head * n_head, n_tokens);
     cb(gate, "gate_reshaped", il);
 
-    Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+    Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv_il, n_tokens);
 
     // Apply MRoPE
     Qcur = ggml_rope_multi(
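
For reference, the control flow the rope_sections change relies on can be sketched outside llama.cpp. In the sketch below, read_sections, kv_in_file, and the hard-coded section values are made-up stand-ins for llama_model_loader::get_key_or_arr and the GGUF metadata; only the throw-vs-return-false behavior and the 4-then-3 fallback mirror the patch.

// Standalone sketch of the fallback pattern above; read_sections is NOT the real llama.cpp API.
#include <algorithm>
#include <array>
#include <cstdint>
#include <cstdio>
#include <stdexcept>
#include <vector>

// Pretend this is the array stored under rope.dimension_sections in the GGUF file
// (3 sections, as in the Ollama qwen3.5:9b case; the values are illustrative).
static const std::vector<int32_t> kv_in_file = {24, 20, 20};

// Mimics the patched get_key_or_arr: on a length mismatch it throws only when
// required is true, otherwise it reports failure by returning false.
static bool read_sections(std::array<int32_t, 4> & out, uint32_t n, bool required) {
    if (n != kv_in_file.size()) {
        if (required) {
            throw std::runtime_error("wrong array length");
        }
        return false;
    }
    std::copy(kv_in_file.begin(), kv_in_file.end(), out.begin());
    return true;
}

int main() {
    std::array<int32_t, 4> rope_sections = {0, 0, 0, 0};

    // First try the 4-section layout without requiring it; fall back to 3 sections,
    // which is required, so an unexpected layout still fails loudly.
    if (!read_sections(rope_sections, 4, /*required=*/false)) {
        read_sections(rope_sections, 3, /*required=*/true);
    }

    std::printf("rope_sections = %d %d %d %d\n",
                rope_sections[0], rope_sections[1], rope_sections[2], rope_sections[3]);
    return 0;
}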