Skip to content

Commit 4ecf73e

Browse files
committed
Improved unwanted text removal to use regex-like strategy
1 parent d45d519 commit 4ecf73e

File tree

2 files changed

+108
-61
lines changed

2 files changed

+108
-61
lines changed

conversion2025/mathpix_to_llm_with_lines_to_api.ipynb

Lines changed: 105 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -1165,49 +1165,86 @@
11651165
"outputs": [],
11661166
"source": [
11671167
"class TrimPosition(BaseModel):\n",
1168-
" start: int = Field(..., description=\"The start position of the trim.\")\n",
1169-
" end: int = Field(..., description=\"The end position of the trim.\")\n",
1168+
" start: str = Field(..., description=\"The start position of the trim.\")\n",
1169+
" end: str = Field(..., description=\"The end position of the trim.\")\n",
11701170
"\n",
11711171
"llm_task_trim_content = f\"\"\"\n",
1172-
" A question has been split into its stem and parts.\n",
1173-
" You will be giving a text from a question's stem, extracted from a markdown file by specifying line numbers to extract from.\n",
1174-
" This means that the first and last lines may contain unwanted text, such as:\n",
1175-
" - Question numbering (e.g. \"1.\", \"2.\", \"(a)\", \"(b)\", \"i.\", \"ii.\" ... etc.)\n",
1176-
" - Text from the previous or next question.\n",
1177-
" Your task is to identify and remove any unwanted text from the start and end of the stem.\n",
1178-
" You will only ever remove text from the start and end of the stem.\n",
1179-
" Provide the position of the first character on the first line that is wanted,\n",
1180-
" and the position of the last character on the last line that is wanted.\n",
1172+
" You will be given the full text of a question, extracted from a markdown file by line numbers.\n",
1173+
" The first and last lines may contain unwanted text, such as:\n",
1174+
" - Question numbering (e.g. \"1.\", \"2.\", \"(a)\", \"(b)\", \"i.\", \"ii.\", etc.)\n",
1175+
" - Text from the previous or next question.\n",
1176+
"\n",
1177+
" Focus only on the actual stem (content) of the question.\n",
1178+
"\n",
1179+
" Your task is to, using the full question as guidance:\n",
1180+
" - From the first line, identify the exact substring where the stem begins, and put it in `start`.\n",
1181+
" - From the last line, identify the exact substring where the stem ends, and put it in `end`.\n",
1182+
" - Ensure that the substrings are taken verbatim from the original text, so they can be located precisely in Python code.\n",
1183+
"\n",
1184+
" We assume that the middle of the stem is always correct, so only the start and end may need trimming.\n",
1185+
"\n",
1186+
" Example #1:\n",
1187+
" first line: \"1. A man is going up hill at 1m/s\"\n",
1188+
" last line: \"1. A man is going up hill at 1m/s\"\n",
1189+
"\n",
1190+
" output:\n",
1191+
" {{\"keep_first_line_from\": \"A man\",\"keep_last_line_until\": \"1m/s\"}}\n",
11811192
" \"\"\"\n",
11821193
"\n",
11831194
"llm_task_trim_part = f\"\"\"\n",
1184-
" You will be giving the text of a question's sub-question, extracted from a markdown file by specifying line numbers to extract from.\n",
1185-
" This means that the first and last lines may contain unwanted text, such as:\n",
1186-
" - Question numbering (e.g. \"1.\", \"2.\", \"(a)\", \"(b)\", \"i.\", \"ii.\" ... etc.)\n",
1187-
" - Text from the previous or next question/parts/solution.\n",
1188-
" Your task is to identify and remove any unwanted text from the start and end of the part content.\n",
1189-
" You will only ever remove text from the start and end of the part content.\n",
1190-
" Provide the position of the first character on the first line that is wanted,\n",
1191-
" and the position of the last character on the last line that is wanted.\n",
1195+
" You will be given the full text of a question, extracted from a markdown file by line numbers.\n",
1196+
" The first and last lines may contain unwanted text, such as:\n",
1197+
" - Question numbering (e.g. \"1.\", \"2.\", \"(a)\", \"(b)\", \"i.\", \"ii.\", etc.)\n",
1198+
" - Text from the previous or next question.\n",
1199+
"\n",
1200+
" Focus only on one sub-question (part) of the question, specified later.\n",
1201+
"\n",
1202+
" Your task is to, using the full question as guidance:\n",
1203+
" - From the first line, identify the exact substring where the sub-question begins, and put it in `start`.\n",
1204+
" - From the last line, identify the exact substring where the sub-question ends, and put it in `end`.\n",
1205+
" - Ensure that the substrings are taken verbatim from the original text, so they can be located precisely in Python code.\n",
1206+
"\n",
1207+
" We assume that the middle of the sub-question is always correct, so only the start and end may need trimming.\n",
1208+
"\n",
1209+
" Example #1:\n",
1210+
" first line: \"answer the following question: (a) what is his speed?\"\n",
1211+
" last line: \"answer the following question: (a) what is his speed?\"\n",
1212+
"\n",
1213+
" output:\n",
1214+
" {{\"keep_first_line_from\": \"what\",\"keep_last_line_until\": \"speed?\"}}\n",
11921215
" \"\"\"\n",
11931216
"\n",
11941217
"llm_task_trim_part_solution = f\"\"\"\n",
1195-
" You will be giving the text of a question's sub-question's solution, extracted from a markdown file by specifying line numbers to extract from.\n",
1196-
" This means that the first and last lines may contain unwanted text, such as:\n",
1197-
" - Question numbering (e.g. \"1.\", \"2.\", \"(a)\", \"(b)\", \"i.\", \"ii.\" ... etc.)\n",
1198-
" - Text from the previous or next question/parts/solution.\n",
1199-
" Your task is to identify and remove any unwanted text from the start and end of the solution content.\n",
1200-
" You will only ever remove text from the start and end of the solution content.\n",
1201-
" Provide the position of the first character on the first line that is wanted,\n",
1202-
" and the position of the last character on the last line that is wanted.\n",
1218+
" You will be given the full text of a question, extracted from a markdown file by line numbers.\n",
1219+
" The first and last lines may contain unwanted text, such as:\n",
1220+
" - Question numbering (e.g. \"1.\", \"2.\", \"(a)\", \"(b)\", \"i.\", \"ii.\", etc.)\n",
1221+
" - Text from the previous or next question.\n",
1222+
"\n",
1223+
" Focus only on one part-solution of a sub-question of the question, specified later.\n",
1224+
"\n",
1225+
" Your task is to, using the full question as guidance:\n",
1226+
" - From the first line, identify the exact substring where the part-solution begins, and put it in `start`.\n",
1227+
" - From the last line, identify the exact substring where the part-solution ends, and put it in `end`.\n",
1228+
" - Ensure that the substrings are taken verbatim from the original text, so they can be located precisely in Python code.\n",
1229+
"\n",
1230+
" We assume that the middle of the part-solution is always correct, so only the start and end may need trimming.\n",
1231+
"\n",
1232+
" Example #1:\n",
1233+
" first line: \"A: (a) 2 + 3 = 5\"\n",
1234+
" last line: \"A: (a) 2 + 3 = 5\"\n",
1235+
"\n",
1236+
" output:\n",
1237+
" {{\"keep_first_line_from\": \"A:\",\"keep_last_line_until\": \"= 5\"}}\n",
12031238
" \"\"\"\n",
12041239
"\n",
12051240
"\n",
12061241
"def trim_question(question: Set_Question_With_Solution) -> Set_Question_With_Solution:\n",
12071242
"\n",
12081243
" def trim_question_content(content_text: str) -> str:\n",
1244+
" if content_text == \"\":\n",
1245+
" return content_text\n",
12091246
"\n",
1210-
" content_text = content_text.split(\"\\n\")\n",
1247+
" # split_content = content_text.split(\"\\n\")\n",
12111248
" content_parser = PydanticOutputParser(pydantic_object=TrimPosition)\n",
12121249
"\n",
12131250
" trim_content_prompt = f\"\"\"\n",
@@ -1219,11 +1256,10 @@
12191256
" Full question:\n",
12201257
" {question}\n",
12211258
"\n",
1222-
" First Line of the content:\n",
1223-
" {enumerate(content_text[0])}\n",
1259+
" Stem (content) of the question:\n",
1260+
" {content_text}\n",
12241261
"\n",
1225-
" Last Line of the content:\n",
1226-
" {enumerate(content_text[-1])}\n",
1262+
" Return the JSON now.\n",
12271263
" \"\"\"\n",
12281264
" \n",
12291265
" for attempt_idx in range(3):\n",
@@ -1232,32 +1268,34 @@
12321268
"\n",
12331269
" try:\n",
12341270
" parsed_output = content_parser.parse(response.content)\n",
1235-
" start = parsed_output.start\n",
1236-
" end = parsed_output.end\n",
1237-
" print(start, end)\n",
1238-
" print(content_text[0])\n",
1239-
" print(content_text[-1])\n",
1271+
" start = content_text[parsed_output.start]\n",
1272+
" end = content_text[parsed_output.end]\n",
12401273
"\n",
1241-
" if start < 0 or end >= len(content_text) or start > end:\n",
1274+
" if start < 0 or start >= len(split_content[0]) or end < 0 or end >= len(split_content[-1]):\n",
12421275
" raise Exception(\"Invalid trim positions.\")\n",
12431276
" \n",
1244-
" content_text[0] = content_text[0][start:]\n",
1245-
" content_text[-1] = content_text[-1][:end + 1]\n",
1277+
" split_content[0] = split_content[0][start:]\n",
1278+
" split_content[-1] = split_content[-1][:end + 1]\n",
12461279
" print(\"LLM response successfully parsed trim positions.\")\n",
12471280
"\n",
1248-
" return \"\\n\".join(content_text).strip()\n",
1281+
" return \"\\n\".join(split_content).strip()\n",
12491282
" except Exception as e:\n",
12501283
" print(f\"Error parsing LLM response as JSON for trimming content:\")\n",
12511284
" print(f\"Retrying... Attempt No.{attempt_idx + 1}\")\n",
12521285
" time.sleep(2)\n",
12531286
" else:\n",
12541287
" print(\"Final LLM Response:\")\n",
12551288
" print(response.content)\n",
1289+
" print(\"Full content:\" , content_text)\n",
1290+
" print(\"length of first line:\", len(split_content[0]))\n",
1291+
" print(\"length of last line:\", len(split_content[-1]))\n",
12561292
" raise Exception(\"Failed to parse LLM response as JSON after multiple attempts for trimming content.\")\n",
12571293
"\n",
12581294
" def trim_question_part(part_text: str) -> str:\n",
1295+
" if part_text == \"\":\n",
1296+
" return part_text\n",
12591297
" \n",
1260-
" part_text = part_text.split(\"\\n\")\n",
1298+
" split_part = part_text.split(\"\\n\")\n",
12611299
" part_parser = PydanticOutputParser(pydantic_object=TrimPosition)\n",
12621300
"\n",
12631301
" trim_part_prompt = f\"\"\"\n",
@@ -1270,10 +1308,12 @@
12701308
" {question}\n",
12711309
"\n",
12721310
" First Line of the part:\n",
1273-
" {enumerate(part_text[0])}\n",
1311+
" {split_part[0]}\n",
12741312
"\n",
12751313
" Last Line of the part:\n",
1276-
" {enumerate(part_text[-1])}\n",
1314+
" {split_part[-1]}\n",
1315+
"\n",
1316+
" Return the JSON now.\n",
12771317
" \"\"\"\n",
12781318
" \n",
12791319
" for attempt_idx in range(3):\n",
@@ -1285,26 +1325,31 @@
12851325
" start = parsed_output.start\n",
12861326
" end = parsed_output.end\n",
12871327
"\n",
1288-
" if start < 0 or end >= len(part_text) or start > end:\n",
1328+
" if start < 0 or start >= len(split_part[0]) or end < 0 or end >= len(split_part[-1]):\n",
12891329
" raise Exception(\"Invalid trim positions.\")\n",
12901330
" \n",
1291-
" part_text[0] = part_text[0][start:]\n",
1292-
" part_text[-1] = part_text[-1][:end + 1]\n",
1331+
" split_part[0] = split_part[0][start:]\n",
1332+
" split_part[-1] = split_part[-1][:end + 1]\n",
12931333
" print(\"LLM response successfully parsed trim positions.\")\n",
12941334
"\n",
1295-
" return \"\\n\".join(part_text).strip()\n",
1335+
" return \"\\n\".join(split_part).strip()\n",
12961336
" except Exception as e:\n",
12971337
" print(f\"Error parsing LLM response as JSON for trimming part:\")\n",
12981338
" print(f\"Retrying... Attempt No.{attempt_idx + 1}\")\n",
12991339
" time.sleep(2)\n",
13001340
" else:\n",
13011341
" print(\"Final LLM Response:\")\n",
13021342
" print(response.content)\n",
1343+
" print(\"Full part:\" , part_text)\n",
1344+
" print(\"length of first line:\", len(split_part[0]))\n",
1345+
" print(\"length of last line:\", len(split_part[-1]))\n",
13031346
" raise Exception(\"Failed to parse LLM response as JSON after multiple attempts for trimming part.\")\n",
13041347
"\n",
13051348
" def trim_question_part_solution(solution_text: str) -> str:\n",
1349+
" if solution_text == \"\":\n",
1350+
" return solution_text\n",
13061351
" \n",
1307-
" solution_text = solution_text.split(\"\\n\")\n",
1352+
" split_solution = solution_text.split(\"\\n\")\n",
13081353
" solution_parser = PydanticOutputParser(pydantic_object=TrimPosition)\n",
13091354
"\n",
13101355
" trim_solution_prompt = f\"\"\"\n",
@@ -1317,10 +1362,12 @@
13171362
" {question}\n",
13181363
"\n",
13191364
" First Line of the solution part:\n",
1320-
" {enumerate(solution_text[0])}\n",
1365+
" {split_solution[0]}\n",
13211366
"\n",
13221367
" Last Line of the solution part:\n",
1323-
" {enumerate(solution_text[-1])}\n",
1368+
" {split_solution[-1]}\n",
1369+
"\n",
1370+
" Return the JSON now.\n",
13241371
" \"\"\"\n",
13251372
" \n",
13261373
" for attempt_idx in range(3):\n",
@@ -1332,14 +1379,14 @@
13321379
" start = parsed_output.start\n",
13331380
" end = parsed_output.end\n",
13341381
"\n",
1335-
" if start < 0 or end >= len(solution_text) or start > end:\n",
1382+
" if start < 0 or start >= len(split_solution[0]) or end < 0 or end >= len(split_solution[-1]):\n",
13361383
" raise Exception(\"Invalid trim positions.\")\n",
13371384
" \n",
1338-
" solution_text[0] = solution_text[0][start:]\n",
1339-
" solution_text[-1] = solution_text[-1][:end + 1]\n",
1385+
" split_solution[0] = split_solution[0][start:]\n",
1386+
" split_solution[-1] = split_solution[-1][:end + 1]\n",
13401387
" print(\"LLM response successfully parsed trim positions.\")\n",
13411388
"\n",
1342-
" return \"\\n\".join(solution_text).strip()\n",
1389+
" return \"\\n\".join(split_solution).strip()\n",
13431390
" except Exception as e:\n",
13441391
" print(f\"Error parsing LLM response as JSON for trimming solution part:\")\n",
13451392
" print(f\"Retrying... Attempt No.{attempt_idx + 1}\")\n",
@@ -1348,6 +1395,9 @@
13481395
" else:\n",
13491396
" print(\"Final LLM Response:\")\n",
13501397
" print(response.content)\n",
1398+
" print(\"Full solution part:\" , solution_text)\n",
1399+
" print(\"length of first line:\", len(split_solution[0]))\n",
1400+
" print(\"length of last line:\", len(split_solution[-1]))\n",
13511401
" raise Exception(\"Failed to parse LLM response as JSON after multiple attempts for trimming solution part.\")\n",
13521402
"\n",
13531403
" question.content = trim_question_content(question.content)\n",

conversion2025/testing_and_prototype.ipynb

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -138,15 +138,12 @@
138138
"llm_mini = ChatOpenAI(\n",
139139
" model=\"gpt-5-mini\",\n",
140140
" api_key=os.environ[\"OPENAI_API_KEY\"],\n",
141-
" reasoning_effort=\"high\"\n",
141+
" reasoning_effort=\"low\",\n",
142+
" cache=True,\n",
142143
" )\n",
143-
"prompt = \"how many letters are in this prompt, only return the number.\"\n",
144-
"prompt = \"return and only return the prompt exactly\"\n",
145-
"prompt = \"waeuifgiufaiu liaisofeoidob ofbea df kdb vboae beoihffewafne nod In this prompt, where does the first p occur, using 0 indexing? only return the answer\"\n",
146-
"\n",
144+
"prompt = \"what is 5+4?\"\n",
147145
"response = llm_mini.invoke(prompt).content\n",
148146
"\n",
149-
"print(list(prompt).index(\"p\"))\n",
150147
"print(response)\n",
151148
"# print(len(response) == int(response))"
152149
]

0 commit comments

Comments
 (0)