From a7e8eb4edfdf5a415ee2bb8997899e7e0809bdaa Mon Sep 17 00:00:00 2001
From: Yushuo Sun <135411736+yushuosun@users.noreply.github.com>
Date: Mon, 29 Jun 2026 00:55:39 +1000
Subject: [PATCH] Fix AttributeError in conversation_to_ids truncation guard

conversation_to_ids() builds the token tensor as `ids` via
torch.from_numpy(np.hstack(input_ids)), but the over-length truncation guard
still reads `input_ids.shape[-1]`. `input_ids` is the raw Python list returned
by conversation_to_ids_* and has no `.shape`, so any sample longer than
max_length raises `AttributeError: 'list' object has no attribute 'shape'` and
aborts finetuning instead of truncating. Use the `ids` tensor in both the guard
and the warning. Note that the warning is logged after `ids` has been sliced,
so the length it reports is read from the already-truncated tensor.
---
 finetune/dataset.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/finetune/dataset.py b/finetune/dataset.py
index 3d46f165..92765333 100644
--- a/finetune/dataset.py
+++ b/finetune/dataset.py
@@ -144,10 +144,10 @@ def conversation_to_ids(conversation, tokenizer, llm_type=None, new_schema=False
 
     ids = torch.from_numpy(np.hstack(input_ids, dtype=np.int32))
     context = torch.from_numpy(np.hstack(context, dtype=np.int8))
-    if input_ids.shape[-1] > max_length:
+    if ids.shape[-1] > max_length:
         ids =ids[:max_length]
         context = context[:max_length]
-        logger.warning(f"The input length ({input_ids.shape[-1]}) exceeds the model's maximum length ({max_length}), so it has been truncated")
+        logger.warning(f"The input length ({ids.shape[-1]}) exceeds the model's maximum length ({max_length}), so it has been truncated")
     
     if torch.all(context):
         logger.error("No tokens available to compute loss.")