From a7e8eb4edfdf5a415ee2bb8997899e7e0809bdaa Mon Sep 17 00:00:00 2001 From: Yushuo Sun <135411736+yushuosun@users.noreply.github.com> Date: Mon, 29 Jun 2026 00:55:39 +1000 Subject: [PATCH] Fix AttributeError in conversation_to_ids truncation guard conversation_to_ids() builds the token tensor as `ids` via np.hstack(input_ids), but the over-length truncation guard still reads `input_ids.shape[-1]`. `input_ids` is the raw Python list returned by conversation_to_ids_* and has no `.shape`, so any sample longer than max_length raises `AttributeError: 'list' object has no attribute 'shape'` and aborts finetuning instead of truncating. Use the `ids` tensor in both the guard and the warning. --- finetune/dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/finetune/dataset.py b/finetune/dataset.py index 3d46f165..92765333 100644 --- a/finetune/dataset.py +++ b/finetune/dataset.py @@ -144,10 +144,10 @@ def conversation_to_ids(conversation, tokenizer, llm_type=None, new_schema=False ids = torch.from_numpy(np.hstack(input_ids, dtype=np.int32)) context = torch.from_numpy(np.hstack(context, dtype=np.int8)) - if input_ids.shape[-1] > max_length: + if ids.shape[-1] > max_length: ids =ids[:max_length] context = context[:max_length] - logger.warning(f"The input length ({input_ids.shape[-1]}) exceeds the model's maximum length ({max_length}), so it has been truncated") + logger.warning(f"The input length ({ids.shape[-1]}) exceeds the model's maximum length ({max_length}), so it has been truncated") if torch.all(context): logger.error("No tokens available to compute loss.")