|
1165 | 1165 | "outputs": [], |
1166 | 1166 | "source": [ |
1167 | 1167 | "class TrimPosition(BaseModel):\n", |
1168 | | - " start: int = Field(..., description=\"The start position of the trim.\")\n", |
1169 | | - " end: int = Field(..., description=\"The end position of the trim.\")\n", |
| 1168 | + " start: str = Field(..., description=\"The start position of the trim.\")\n", |
| 1169 | + " end: str = Field(..., description=\"The end position of the trim.\")\n", |
1170 | 1170 | "\n", |
1171 | 1171 | "llm_task_trim_content = f\"\"\"\n", |
1172 | | - " A question has been split into its stem and parts.\n", |
1173 | | - " You will be giving a text from a question's stem, extracted from a markdown file by specifying line numbers to extract from.\n", |
1174 | | - " This means that the first and last lines may contain unwanted text, such as:\n", |
1175 | | - " - Question numbering (e.g. \"1.\", \"2.\", \"(a)\", \"(b)\", \"i.\", \"ii.\" ... etc.)\n", |
1176 | | - " - Text from the previous or next question.\n", |
1177 | | - " Your task is to identify and remove any unwanted text from the start and end of the stem.\n", |
1178 | | - " You will only ever remove text from the start and end of the stem.\n", |
1179 | | - " Provide the position of the first character on the first line that is wanted,\n", |
1180 | | - " and the position of the last character on the last line that is wanted.\n", |
| 1172 | + " You will be given the full text of a question, extracted from a markdown file by line numbers.\n", |
| 1173 | + " The first and last lines may contain unwanted text, such as:\n", |
| 1174 | + " - Question numbering (e.g. \"1.\", \"2.\", \"(a)\", \"(b)\", \"i.\", \"ii.\", etc.)\n", |
| 1175 | + " - Text from the previous or next question.\n", |
| 1176 | + "\n", |
| 1177 | + " Focus only on the actual stem (content) of the question.\n", |
| 1178 | + "\n", |
| 1179 | + " Your task is to, using the full question as guidance:\n", |
| 1180 | + " - From the first line, identify the exact substring where the stem begins, and put it in `start`.\n", |
| 1181 | + " - From the last line, identify the exact substring where the stem ends, and put it in `end`.\n", |
| 1182 | + " - Ensure that the substrings are taken verbatim from the original text, so they can be located precisely in Python code.\n", |
| 1183 | + "\n", |
| 1184 | + " We assume that the middle of the stem is always correct, so only the start and end may need trimming.\n", |
| 1185 | + "\n", |
| 1186 | + " Example #1:\n", |
| 1187 | + " first line: \"1. A man is going up hill at 1m/s\"\n", |
| 1188 | + " last line: \"1. A man is going up hill at 1m/s\"\n", |
| 1189 | + "\n", |
| 1190 | + " output:\n", |
| 1191 | + " {{\"keep_first_line_from\": \"A man\",\"keep_last_line_until\": \"1m/s\"}}\n", |
1181 | 1192 | " \"\"\"\n", |
1182 | 1193 | "\n", |
1183 | 1194 | "llm_task_trim_part = f\"\"\"\n", |
1184 | | - " You will be giving the text of a question's sub-question, extracted from a markdown file by specifying line numbers to extract from.\n", |
1185 | | - " This means that the first and last lines may contain unwanted text, such as:\n", |
1186 | | - " - Question numbering (e.g. \"1.\", \"2.\", \"(a)\", \"(b)\", \"i.\", \"ii.\" ... etc.)\n", |
1187 | | - " - Text from the previous or next question/parts/solution.\n", |
1188 | | - " Your task is to identify and remove any unwanted text from the start and end of the part content.\n", |
1189 | | - " You will only ever remove text from the start and end of the part content.\n", |
1190 | | - " Provide the position of the first character on the first line that is wanted,\n", |
1191 | | - " and the position of the last character on the last line that is wanted.\n", |
| 1195 | + " You will be given the full text of a question, extracted from a markdown file by line numbers.\n", |
| 1196 | + " The first and last lines may contain unwanted text, such as:\n", |
| 1197 | + " - Question numbering (e.g. \"1.\", \"2.\", \"(a)\", \"(b)\", \"i.\", \"ii.\", etc.)\n", |
| 1198 | + " - Text from the previous or next question.\n", |
| 1199 | + "\n", |
| 1200 | + " Focus only on one sub-question (part) of the question, specified later.\n", |
| 1201 | + "\n", |
| 1202 | + " Your task is to, using the full question as guidance:\n", |
| 1203 | + " - From the first line, identify the exact substring where the sub-question begins, and put it in `start`.\n", |
| 1204 | + " - From the last line, identify the exact substring where the sub-question ends, and put it in `end`.\n", |
| 1205 | + " - Ensure that the substrings are taken verbatim from the original text, so they can be located precisely in Python code.\n", |
| 1206 | + "\n", |
| 1207 | + " We assume that the middle of the sub-question is always correct, so only the start and end may need trimming.\n", |
| 1208 | + "\n", |
| 1209 | + " Example #1:\n", |
| 1210 | + " first line: \"answer the following question: (a) what is his speed?\"\n", |
| 1211 | + " last line: \"answer the following question: (a) what is his speed?\"\n", |
| 1212 | + "\n", |
| 1213 | + " output:\n", |
| 1214 | + " {{\"keep_first_line_from\": \"what\",\"keep_last_line_until\": \"speed?\"}}\n", |
1192 | 1215 | " \"\"\"\n", |
1193 | 1216 | "\n", |
1194 | 1217 | "llm_task_trim_part_solution = f\"\"\"\n", |
1195 | | - " You will be giving the text of a question's sub-question's solution, extracted from a markdown file by specifying line numbers to extract from.\n", |
1196 | | - " This means that the first and last lines may contain unwanted text, such as:\n", |
1197 | | - " - Question numbering (e.g. \"1.\", \"2.\", \"(a)\", \"(b)\", \"i.\", \"ii.\" ... etc.)\n", |
1198 | | - " - Text from the previous or next question/parts/solution.\n", |
1199 | | - " Your task is to identify and remove any unwanted text from the start and end of the solution content.\n", |
1200 | | - " You will only ever remove text from the start and end of the solution content.\n", |
1201 | | - " Provide the position of the first character on the first line that is wanted,\n", |
1202 | | - " and the position of the last character on the last line that is wanted.\n", |
| 1218 | + " You will be given the full text of a question, extracted from a markdown file by line numbers.\n", |
| 1219 | + " The first and last lines may contain unwanted text, such as:\n", |
| 1220 | + " - Question numbering (e.g. \"1.\", \"2.\", \"(a)\", \"(b)\", \"i.\", \"ii.\", etc.)\n", |
| 1221 | + " - Text from the previous or next question.\n", |
| 1222 | + "\n", |
| 1223 | + " Focus only on one part-solution of a sub-question of the question, specified later.\n", |
| 1224 | + "\n", |
| 1225 | + " Your task is to, using the full question as guidance:\n", |
| 1226 | + " - From the first line, identify the exact substring where the part-solution begins, and put it in `start`.\n", |
| 1227 | + " - From the last line, identify the exact substring where the part-solution ends, and put it in `end`.\n", |
| 1228 | + " - Ensure that the substrings are taken verbatim from the original text, so they can be located precisely in Python code.\n", |
| 1229 | + "\n", |
| 1230 | + " We assume that the middle of the part-solution is always correct, so only the start and end may need trimming.\n", |
| 1231 | + "\n", |
| 1232 | + " Example #1:\n", |
| 1233 | + " first line: \"A: (a) 2 + 3 = 5\"\n", |
| 1234 | + " last line: \"A: (a) 2 + 3 = 5\"\n", |
| 1235 | + "\n", |
| 1236 | + " output:\n", |
| 1237 | + " {{\"keep_first_line_from\": \"A:\",\"keep_last_line_until\": \"= 5\"}}\n", |
1203 | 1238 | " \"\"\"\n", |
1204 | 1239 | "\n", |
1205 | 1240 | "\n", |
1206 | 1241 | "def trim_question(question: Set_Question_With_Solution) -> Set_Question_With_Solution:\n", |
1207 | 1242 | "\n", |
1208 | 1243 | " def trim_question_content(content_text: str) -> str:\n", |
| 1244 | + " if content_text == \"\":\n", |
| 1245 | + " return content_text\n", |
1209 | 1246 | "\n", |
1210 | | - " content_text = content_text.split(\"\\n\")\n", |
| 1247 | + " # split_content = content_text.split(\"\\n\")\n", |
1211 | 1248 | " content_parser = PydanticOutputParser(pydantic_object=TrimPosition)\n", |
1212 | 1249 | "\n", |
1213 | 1250 | " trim_content_prompt = f\"\"\"\n", |
|
1219 | 1256 | " Full question:\n", |
1220 | 1257 | " {question}\n", |
1221 | 1258 | "\n", |
1222 | | - " First Line of the content:\n", |
1223 | | - " {enumerate(content_text[0])}\n", |
| 1259 | + " Stem (content) of the question:\n", |
| 1260 | + " {content_text}\n", |
1224 | 1261 | "\n", |
1225 | | - " Last Line of the content:\n", |
1226 | | - " {enumerate(content_text[-1])}\n", |
| 1262 | + " Return the JSON now.\n", |
1227 | 1263 | " \"\"\"\n", |
1228 | 1264 | " \n", |
1229 | 1265 | " for attempt_idx in range(3):\n", |
|
1232 | 1268 | "\n", |
1233 | 1269 | " try:\n", |
1234 | 1270 | " parsed_output = content_parser.parse(response.content)\n", |
1235 | | - " start = parsed_output.start\n", |
1236 | | - " end = parsed_output.end\n", |
1237 | | - " print(start, end)\n", |
1238 | | - " print(content_text[0])\n", |
1239 | | - " print(content_text[-1])\n", |
| 1271 | + " start = content_text[parsed_output.start]\n", |
| 1272 | + " end = content_text[parsed_output.end]\n", |
1240 | 1273 | "\n", |
1241 | | - " if start < 0 or end >= len(content_text) or start > end:\n", |
| 1274 | + " if start < 0 or start >= len(split_content[0]) or end < 0 or end >= len(split_content[-1]):\n", |
1242 | 1275 | " raise Exception(\"Invalid trim positions.\")\n", |
1243 | 1276 | " \n", |
1244 | | - " content_text[0] = content_text[0][start:]\n", |
1245 | | - " content_text[-1] = content_text[-1][:end + 1]\n", |
| 1277 | + " split_content[0] = split_content[0][start:]\n", |
| 1278 | + " split_content[-1] = split_content[-1][:end + 1]\n", |
1246 | 1279 | " print(\"LLM response successfully parsed trim positions.\")\n", |
1247 | 1280 | "\n", |
1248 | | - " return \"\\n\".join(content_text).strip()\n", |
| 1281 | + " return \"\\n\".join(split_content).strip()\n", |
1249 | 1282 | " except Exception as e:\n", |
1250 | 1283 | " print(f\"Error parsing LLM response as JSON for trimming content:\")\n", |
1251 | 1284 | " print(f\"Retrying... Attempt No.{attempt_idx + 1}\")\n", |
1252 | 1285 | " time.sleep(2)\n", |
1253 | 1286 | " else:\n", |
1254 | 1287 | " print(\"Final LLM Response:\")\n", |
1255 | 1288 | " print(response.content)\n", |
| 1289 | + " print(\"Full content:\" , content_text)\n", |
| 1290 | + " print(\"length of first line:\", len(split_content[0]))\n", |
| 1291 | + " print(\"length of last line:\", len(split_content[-1]))\n", |
1256 | 1292 | " raise Exception(\"Failed to parse LLM response as JSON after multiple attempts for trimming content.\")\n", |
1257 | 1293 | "\n", |
1258 | 1294 | " def trim_question_part(part_text: str) -> str:\n", |
| 1295 | + " if part_text == \"\":\n", |
| 1296 | + " return part_text\n", |
1259 | 1297 | " \n", |
1260 | | - " part_text = part_text.split(\"\\n\")\n", |
| 1298 | + " split_part = part_text.split(\"\\n\")\n", |
1261 | 1299 | " part_parser = PydanticOutputParser(pydantic_object=TrimPosition)\n", |
1262 | 1300 | "\n", |
1263 | 1301 | " trim_part_prompt = f\"\"\"\n", |
|
1270 | 1308 | " {question}\n", |
1271 | 1309 | "\n", |
1272 | 1310 | " First Line of the part:\n", |
1273 | | - " {enumerate(part_text[0])}\n", |
| 1311 | + " {split_part[0]}\n", |
1274 | 1312 | "\n", |
1275 | 1313 | " Last Line of the part:\n", |
1276 | | - " {enumerate(part_text[-1])}\n", |
| 1314 | + " {split_part[-1]}\n", |
| 1315 | + "\n", |
| 1316 | + " Return the JSON now.\n", |
1277 | 1317 | " \"\"\"\n", |
1278 | 1318 | " \n", |
1279 | 1319 | " for attempt_idx in range(3):\n", |
|
1285 | 1325 | " start = parsed_output.start\n", |
1286 | 1326 | " end = parsed_output.end\n", |
1287 | 1327 | "\n", |
1288 | | - " if start < 0 or end >= len(part_text) or start > end:\n", |
| 1328 | + " if start < 0 or start >= len(split_part[0]) or end < 0 or end >= len(split_part[-1]):\n", |
1289 | 1329 | " raise Exception(\"Invalid trim positions.\")\n", |
1290 | 1330 | " \n", |
1291 | | - " part_text[0] = part_text[0][start:]\n", |
1292 | | - " part_text[-1] = part_text[-1][:end + 1]\n", |
| 1331 | + " split_part[0] = split_part[0][start:]\n", |
| 1332 | + " split_part[-1] = split_part[-1][:end + 1]\n", |
1293 | 1333 | " print(\"LLM response successfully parsed trim positions.\")\n", |
1294 | 1334 | "\n", |
1295 | | - " return \"\\n\".join(part_text).strip()\n", |
| 1335 | + " return \"\\n\".join(split_part).strip()\n", |
1296 | 1336 | " except Exception as e:\n", |
1297 | 1337 | " print(f\"Error parsing LLM response as JSON for trimming part:\")\n", |
1298 | 1338 | " print(f\"Retrying... Attempt No.{attempt_idx + 1}\")\n", |
1299 | 1339 | " time.sleep(2)\n", |
1300 | 1340 | " else:\n", |
1301 | 1341 | " print(\"Final LLM Response:\")\n", |
1302 | 1342 | " print(response.content)\n", |
| 1343 | + " print(\"Full part:\" , part_text)\n", |
| 1344 | + " print(\"length of first line:\", len(split_part[0]))\n", |
| 1345 | + " print(\"length of last line:\", len(split_part[-1]))\n", |
1303 | 1346 | " raise Exception(\"Failed to parse LLM response as JSON after multiple attempts for trimming part.\")\n", |
1304 | 1347 | "\n", |
1305 | 1348 | " def trim_question_part_solution(solution_text: str) -> str:\n", |
| 1349 | + " if solution_text == \"\":\n", |
| 1350 | + " return solution_text\n", |
1306 | 1351 | " \n", |
1307 | | - " solution_text = solution_text.split(\"\\n\")\n", |
| 1352 | + " split_solution = solution_text.split(\"\\n\")\n", |
1308 | 1353 | " solution_parser = PydanticOutputParser(pydantic_object=TrimPosition)\n", |
1309 | 1354 | "\n", |
1310 | 1355 | " trim_solution_prompt = f\"\"\"\n", |
|
1317 | 1362 | " {question}\n", |
1318 | 1363 | "\n", |
1319 | 1364 | " First Line of the solution part:\n", |
1320 | | - " {enumerate(solution_text[0])}\n", |
| 1365 | + " {split_solution[0]}\n", |
1321 | 1366 | "\n", |
1322 | 1367 | " Last Line of the solution part:\n", |
1323 | | - " {enumerate(solution_text[-1])}\n", |
| 1368 | + " {split_solution[-1]}\n", |
| 1369 | + "\n", |
| 1370 | + " Return the JSON now.\n", |
1324 | 1371 | " \"\"\"\n", |
1325 | 1372 | " \n", |
1326 | 1373 | " for attempt_idx in range(3):\n", |
|
1332 | 1379 | " start = parsed_output.start\n", |
1333 | 1380 | " end = parsed_output.end\n", |
1334 | 1381 | "\n", |
1335 | | - " if start < 0 or end >= len(solution_text) or start > end:\n", |
| 1382 | + " if start < 0 or start >= len(split_solution[0]) or end < 0 or end >= len(split_solution[-1]):\n", |
1336 | 1383 | " raise Exception(\"Invalid trim positions.\")\n", |
1337 | 1384 | " \n", |
1338 | | - " solution_text[0] = solution_text[0][start:]\n", |
1339 | | - " solution_text[-1] = solution_text[-1][:end + 1]\n", |
| 1385 | + " split_solution[0] = split_solution[0][start:]\n", |
| 1386 | + " split_solution[-1] = split_solution[-1][:end + 1]\n", |
1340 | 1387 | " print(\"LLM response successfully parsed trim positions.\")\n", |
1341 | 1388 | "\n", |
1342 | | - " return \"\\n\".join(solution_text).strip()\n", |
| 1389 | + " return \"\\n\".join(split_solution).strip()\n", |
1343 | 1390 | " except Exception as e:\n", |
1344 | 1391 | " print(f\"Error parsing LLM response as JSON for trimming solution part:\")\n", |
1345 | 1392 | " print(f\"Retrying... Attempt No.{attempt_idx + 1}\")\n", |
|
1348 | 1395 | " else:\n", |
1349 | 1396 | " print(\"Final LLM Response:\")\n", |
1350 | 1397 | " print(response.content)\n", |
| 1398 | + " print(\"Full solution part:\" , solution_text)\n", |
| 1399 | + " print(\"length of first line:\", len(split_solution[0]))\n", |
| 1400 | + " print(\"length of last line:\", len(split_solution[-1]))\n", |
1351 | 1401 | " raise Exception(\"Failed to parse LLM response as JSON after multiple attempts for trimming solution part.\")\n", |
1352 | 1402 | "\n", |
1353 | 1403 | " question.content = trim_question_content(question.content)\n", |
|
0 commit comments