10 changes: 8 additions & 2 deletions src/llama-model-loader.cpp
@@ -455,7 +455,10 @@ namespace GGUFMeta {
         GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(metadata, kid);

     if (n != arr_info.length) {
-        throw std::runtime_error(format("key %s has wrong array length; expected %u, got %u", key.c_str(), n, (uint32_t) arr_info.length));
+        if (required) {
+            throw std::runtime_error(format("key %s has wrong array length; expected %u, got %u", key.c_str(), n, (uint32_t) arr_info.length));
+        }
+        return false;
     }

     return get_arr(key, result, required);
@@ -1317,9 +1320,12 @@ struct ggml_tensor * llama_model_loader::create_tensor_as_view(struct ggml_context
 }

 void llama_model_loader::done_getting_tensors() const {
-    if (n_created != n_tensors) {
+    if (n_created > n_tensors) {
         throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
     }
+    if (n_created < n_tensors) {
+        LLAMA_LOG_WARN("%s: %d tensors in file not loaded (multimodal vision/MTP tensors skipped for text-only inference)\n", __func__, n_tensors - n_created);
+    }
     if (n_tensors_moved > 0) {
         LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %zu others) cannot be used with preferred buffer type %s, using %s instead\n",
                 __func__, first_tensor_moved_name.c_str(), first_tensor_moved_type_name.c_str(), n_tensors_moved - 1,
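For reference, a minimal standalone sketch of the relaxed accounting that done_getting_tensors() adopts above; the function name, log call, and messages below are simplified stand-ins, not the actual loader code. Creating more tensors than the file declares is still a hard error, while tensors left unused in the file (e.g. multimodal vision/MTP weights during text-only inference) only produce a warning.

    #include <cstdio>
    #include <stdexcept>

    // Sketch only: mirrors the check order in the hunk above.
    //   n_tensors - tensors declared in the GGUF file
    //   n_created - tensors actually materialized by the loader
    static void check_tensor_counts(int n_tensors, int n_created) {
        if (n_created > n_tensors) {
            // asking for more tensors than the file provides is still fatal
            throw std::runtime_error("wrong number of tensors");
        }
        if (n_created < n_tensors) {
            // extra tensors in the file are only reported, not treated as an error
            std::fprintf(stderr, "warning: %d tensors in file not loaded\n", n_tensors - n_created);
        }
    }

    int main() {
        check_tensor_counts(/*n_tensors=*/10, /*n_created=*/8); // warns, does not throw
        return 0;
    }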
21 changes: 14 additions & 7 deletions src/llama-model.cpp
@@ -2788,7 +2788,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         case LLM_ARCH_QWEN35:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
+                // Accept 3 or 4 rope sections (Ollama qwen3.5:9b GGUF has 3; some variants have 4).
+                // get_key_or_arr with required=false returns false on length mismatch instead of throwing.
+                if (!ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false)) {
+                    ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 3, true);
+                }

                 // Load linear attention (gated delta net) parameters
                 ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
@@ -2818,8 +2822,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
                 ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-
-                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
+                // Accept 3 or 4 rope sections (Ollama qwen3.5:9b GGUF has 3; some variants have 4).
+                // get_key_or_arr with required=false returns false on length mismatch instead of throwing.
+                if (!ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false)) {
+                    ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 3, true);
+                }

                 // Load linear attention (gated delta net) parameters
                 ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
@@ -7548,7 +7555,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

                     if (!hparams.is_recurrent(i)) {
                         // Attention layers
-                        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, 0);
+                        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head * 2, hparams.n_embd_k_gqa(i), hparams.n_embd_v_gqa(i), 0);
                         layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);

                         // Q/K normalization for attention layers
@@ -7560,7 +7567,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED);
                         layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED);
                         layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
-                        layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
+                        layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, i), { hparams.ssm_dt_rank }, 0);
                         layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0);
                         layer.ssm_beta = create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", i), { n_embd, n_v_heads }, 0);
                         layer.ssm_alpha = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "weight", i), { n_embd, n_v_heads }, 0);
@@ -7611,7 +7618,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

                     if (!hparams.is_recurrent(i)) {
                         // Attention layers
-                        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, 0);
+                        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head * 2, hparams.n_embd_k_gqa(i), hparams.n_embd_v_gqa(i), 0);
                         layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);

                         // Q/K normalization for attention layers
@@ -7623,7 +7630,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED);
                         layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED);
                         layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
-                        layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
+                        layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, i), { hparams.ssm_dt_rank }, 0);
                         layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0);
                         layer.ssm_beta = create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", i), { n_embd, n_v_heads }, 0);
                         layer.ssm_alpha = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "weight", i), { n_embd, n_v_heads }, 0);
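A small self-contained illustration of why the QKV tensor shapes above switch from the global n_embd_k_gqa/n_embd_v_gqa values to the per-layer accessors; the hparams struct below is hypothetical, not llama.cpp's llama_hparams. In a hybrid model the recurrent (gated delta net) layers and the full-attention layers need not share one KV head count, so a single constant cannot size every layer's K/V projections.

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Hypothetical sketch: per-layer KV head counts drive per-layer GQA dimensions.
    struct hparams_sketch {
        std::vector<uint32_t> n_head_kv_arr; // per-layer KV head count (0 = recurrent layer)
        uint32_t n_embd_head_k = 128;

        uint32_t n_embd_k_gqa(uint32_t il) const { return n_embd_head_k * n_head_kv_arr[il]; }
    };

    int main() {
        hparams_sketch hp;
        hp.n_head_kv_arr = {8, 0, 8, 0}; // alternating attention and recurrent layers

        for (uint32_t il = 0; il < hp.n_head_kv_arr.size(); ++il) {
            // a single global value (8 * 128 here) would be wrong for the recurrent layers
            std::printf("layer %u: n_embd_k_gqa = %u\n", il, hp.n_embd_k_gqa(il));
        }
        return 0;
    }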
6 changes: 4 additions & 2 deletions src/models/qwen35.cpp
@@ -144,8 +144,10 @@ ggml_tensor * llm_build_qwen35::build_layer_attn(
     ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s);
     cb(Vcur, "Vcur", il);

+    const int64_t n_head_kv_il = model.hparams.n_head_kv(il);
+
     // Apply K normalization
-    Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+    Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv_il, n_tokens);
     Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il);
     cb(Kcur, "Kcur_normed", il);

@@ -156,7 +158,7 @@ ggml_tensor * llm_build_qwen35::build_layer_attn(
     gate = ggml_cont_2d(ctx0, gate, n_embd_head * n_head, n_tokens);
     cb(gate, "gate_reshaped", il);

-    Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+    Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv_il, n_tokens);

     // Apply MRoPE
     Qcur = ggml_rope_multi(
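Along the same lines, a hedged arithmetic sketch (no ggml calls, hypothetical numbers) of why the reshapes in qwen35.cpp now use the per-layer n_head_kv(il): the flat K/V projection for a layer holds n_embd_head * n_head_kv(il) values per token, so a 3-D view sized with a global head count only matches when the two counts happen to agree.

    #include <cassert>
    #include <cstdint>

    // Hypothetical per-layer KV head counts for two attention layers of a hybrid model.
    static int64_t n_head_kv(int il) { return il == 0 ? 4 : 2; }

    int main() {
        const int64_t n_embd_head      = 128;
        const int64_t n_tokens         = 32;
        const int64_t global_n_head_kv = 4; // what a non-indexed n_head_kv would report

        for (int il = 0; il < 2; ++il) {
            // elements actually produced by this layer's K projection
            const int64_t k_elements = n_embd_head * n_head_kv(il) * n_tokens;
            // elements implied by a (n_embd_head, global_n_head_kv, n_tokens) view
            const int64_t view_elements = n_embd_head * global_n_head_kv * n_tokens;
            // the view is only valid when the per-layer and global counts agree
            assert((view_elements == k_elements) == (n_head_kv(il) == global_n_head_kv));
        }
        return 0;
    }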