Update trimming prompts and remove doubled backslashes from LLM output

HarrySu123 · HarrySu123 · commit 6d4d51adda90 · 2025-09-10T17:30:12.000+01:00
diff --git a/conversion2025/mathpix_to_llm_with_lines_to_api.ipynb b/conversion2025/mathpix_to_llm_with_lines_to_api.ipynb
@@ -1291,20 +1291,21 @@
     "    end: str = Field(..., description=\"The end position of the trim.\")\n",
     "\n",
     "llm_task_trim_content = f\"\"\"\n",
-    "    You will be given the full text of a question, extracted from a markdown file by line numbers.\n",
+    "    You will be given the full text of a question, extracted from a markdown file using line numbers.\n",
+    "    Assuming the extracted text is correct, only the start of the first line and the end of the last line may contain unwanted text.\n",
     "    The first and last lines may contain unwanted text, such as:\n",
     "        - Question numbering (e.g. \"1.\", \"2.\", \"(a)\", \"(b)\", \"i.\", \"ii.\", etc.)\n",
     "        - Text from the previous or next question.\n",
+    "    We want to remove this unwanted text.\n",
     "\n",
     "    Focus only on the actual stem (content) of the question.\n",
     "\n",
     "    Your task is to, using the full question as guidance:\n",
-    "        - From the first line, identify the exact substring where the stem begins, without the unwanted text, and put it in `start`.\n",
-    "        - From the last line, identify the exact substring where the stem ends, without the unwanted text, and put it in `end`.\n",
-    "        - Ensure that the substrings are taken verbatim from the original text, so they can be located precisely in Python code.\n",
-    "        - Try to output as little as possible.\n",
-    "\n",
-    "    We assume that the middle of the stem is always correct, so only the start and end may need trimming.\n",
+    "        - The first and last lines of the stem may be the same line if the stem is only one line long.\n",
+    "        - From the first line, identify the exact substring where the wanted text begins, and put it in `start`.\n",
+    "        - From the last line, identify the exact substring where the wanted text ends and put it in `end`.\n",
+    "        - These two substrings will be used to find the start and end index using regex afterwards, so try to use as few words as possible.\n",
+    "        - Overlapping between start and end is allowed.\n",
     "\n",
     "    Example #1:\n",
     "        first line: \"1. A man is going up hill at 1m/s\"\n",
@@ -1315,20 +1316,21 @@
     "    \"\"\"\n",
     "\n",
     "llm_task_trim_part = f\"\"\"\n",
-    "    You will be given the full text of a question, extracted from a markdown file by line numbers.\n",
+    "    You will be given the full text of a question, extracted from a markdown file using line numbers.\n",
+    "    Assuming the extracted text is correct, only the start of the first line and the end of the last line may contain unwanted text.\n",
     "    The first and last lines may contain unwanted text, such as:\n",
     "        - Question numbering (e.g. \"1.\", \"2.\", \"(a)\", \"(b)\", \"i.\", \"ii.\", etc.)\n",
     "        - Text from the previous or next question.\n",
+    "    We want to remove this unwanted text.\n",
     "\n",
     "    Focus only on one sub-question (part) of the question, specified later.\n",
     "\n",
     "    Your task is to, using the full question as guidance:\n",
-    "        - Identify the exact substring where the sub-question begins, without the unwanted text, and put it in `start`.\n",
-    "        - Identify the exact substring where the sub-question ends, without the unwanted text, and put it in `end`.\n",
-    "        - Ensure that the substrings are taken verbatim from the original text, so they can be located precisely in Python code.\n",
-    "        - Try to output as little as possible.\n",
-    "\n",
-    "    We assume that the middle of the sub-question is always correct, so only the start and end may need trimming.\n",
+    "        - The first and last lines of the sub-question may be the same line if the sub-question is only one line long.\n",
+    "        - From the first line, identify the exact substring where the wanted text begins, and put it in `start`.\n",
+    "        - From the last line, identify the exact substring where the wanted text ends and put it in `end`.\n",
+    "        - These two substrings will be used to find the start and end index using regex afterwards, so try to use as few words as possible.\n",
+    "        - Overlapping between start and end is allowed.\n",
     "\n",
     "    Example #1:\n",
     "        first line: \"answer the following question: (a) what is his speed?\"\n",
@@ -1339,20 +1341,21 @@
     "    \"\"\"\n",
     "\n",
     "llm_task_trim_part_solution = f\"\"\"\n",
-    "    You will be given the full text of a question, extracted from a markdown file by line numbers.\n",
+    "    You will be given the full text of a question, extracted from a markdown file using line numbers.\n",
+    "    Assuming the extracted text is correct, only the start of the first line and the end of the last line may contain unwanted text.\n",
     "    The first and last lines may contain unwanted text, such as:\n",
     "        - Question numbering (e.g. \"1.\", \"2.\", \"(a)\", \"(b)\", \"i.\", \"ii.\", etc.)\n",
     "        - Text from the previous or next question.\n",
+    "    We want to remove this unwanted text.\n",
     "\n",
-    "    Focus only on one part-solution of a sub-question of the question, specified later.\n",
+    "    Focus only on one part-solution of a sub-question (part) of the question, specified later.\n",
     "\n",
     "    Your task is to, using the full question as guidance:\n",
-    "        - Identify the exact substring where the part-solution begins, without the unwanted text, and put it in `start`.\n",
-    "        - Identify the exact substring where the part-solution ends, without the unwanted text, and put it in `end`.\n",
-    "        - Ensure that the substrings are taken verbatim from the original text, so they can be located precisely in Python code.\n",
-    "        - Try to output as little as possible.\n",
-    "\n",
-    "    We assume that the middle of the part-solution is always correct, so only the start and end may need trimming.\n",
+    "        - The first and last lines of the part-solution may be the same line if the part-solution is only one line long.\n",
+    "        - From the first line, identify the exact substring where the wanted text begins, and put it in `start`.\n",
+    "        - From the last line, identify the exact substring where the wanted text ends and put it in `end`.\n",
+    "        - These two substrings will be used to find the start and end index using regex afterwards, so try to use as few words as possible.\n",
+    "        - Overlapping between start and end is allowed.\n",
     "\n",
     "    Example #1:\n",
     "        first line: \"A: (a) 2 + 3 = 5\"\n",
@@ -1384,7 +1387,7 @@
     "            Full question:\n",
     "            {question}\n",
     "\n",
-    "            Stem (content) of the question:\n",
+    "            Stem (content) of the question to extract from:\n",
     "            {content_text}\n",
     "\n",
     "            Return the JSON now.\n",
@@ -1428,7 +1431,7 @@
     "            Full question:\n",
     "            {question}\n",
     "\n",
-    "            specific sub-question (part) of the question:\n",
+    "            specific sub-question (part) of the question to extract from:\n",
     "            {part_text}\n",
     "\n",
     "            Return the JSON now.\n",
@@ -1472,7 +1475,7 @@
     "            Full question:\n",
     "            {question}\n",
     "\n",
-    "            Specific part-solution of the question:\n",
+    "            Specific part-solution of the question to extract from:\n",
     "            {solution_text}\n",
     "\n",
     "            Return the JSON now.\n",
@@ -1484,13 +1487,14 @@
     "\n",
     "            try:\n",
     "                parsed_output = solution_parser.parse(response.content)\n",
-    "                start = solution_text.index(parsed_output.start)\n",
-    "                end   = solution_text.index(parsed_output.end) + len(parsed_output.end)\n",
+    "                start = solution_text.index(parsed_output.start.replace('\\\\\\\\', '\\\\'))\n",
+    "                end   = solution_text.index(parsed_output.end.replace('\\\\\\\\', '\\\\')) + len(parsed_output.end)\n",
     "                print(f\"Successfully trimmed part-solution for question {question_number}, part {part_number}.\")\n",
     "\n",
     "                return improve_trim(solution_text, start, end)\n",
     "            except Exception as e:\n",
     "                print(f\"Error parsing LLM response as JSON for trimming solution part for question {question_number}, part {part_number}\")\n",
+    "                print(response.content)\n",
     "                print(f\"Retrying... Attempt No.{attempt_idx + 1}\")\n",
     "                time.sleep(2)\n",
     "\n",