Improved unwanted text removal to use regex-like strategy

HarrySu123 · HarrySu123 · commit 4ecf73e0d4ce · 2025-09-08T15:08:30.000+01:00
diff --git a/conversion2025/mathpix_to_llm_with_lines_to_api.ipynb b/conversion2025/mathpix_to_llm_with_lines_to_api.ipynb
@@ -1165,49 +1165,86 @@
    "outputs": [],
    "source": [
     "class TrimPosition(BaseModel):\n",
-    "    start: int = Field(..., description=\"The start position of the trim.\")\n",
-    "    end: int = Field(..., description=\"The end position of the trim.\")\n",
+    "    start: str = Field(..., description=\"The exact substring of the first line where the wanted text begins.\")\n",
+    "    end: str = Field(..., description=\"The exact substring of the last line where the wanted text ends.\")\n",
     "\n",
     "llm_task_trim_content = f\"\"\"\n",
-    "    A question has been split into its stem and parts.\n",
-    "    You will be giving a text from a question's stem, extracted from a markdown file by specifying line numbers to extract from.\n",
-    "    This means that the first and last lines may contain unwanted text, such as:\n",
-    "        -   Question numbering (e.g. \"1.\", \"2.\", \"(a)\", \"(b)\", \"i.\", \"ii.\" ... etc.)\n",
-    "        -   Text from the previous or next question.\n",
-    "    Your task is to identify and remove any unwanted text from the start and end of the stem.\n",
-    "    You will only ever remove text from the start and end of the stem.\n",
-    "    Provide the position of the first character on the first line that is wanted,\n",
-    "    and the position of the last character on the last line that is wanted.\n",
+    "    You will be given the full text of a question, extracted from a markdown file by line numbers.\n",
+    "    The first and last lines may contain unwanted text, such as:\n",
+    "        - Question numbering (e.g. \"1.\", \"2.\", \"(a)\", \"(b)\", \"i.\", \"ii.\", etc.)\n",
+    "        - Text from the previous or next question.\n",
+    "\n",
+    "    Focus only on the actual stem (content) of the question.\n",
+    "\n",
+    "    Your task is to, using the full question as guidance:\n",
+    "        - From the first line, identify the exact substring where the stem begins, and put it in `start`.\n",
+    "        - From the last line, identify the exact substring where the stem ends, and put it in `end`.\n",
+    "        - Ensure that the substrings are taken verbatim from the original text, so they can be located precisely in Python code.\n",
+    "\n",
+    "    We assume that the middle of the stem is always correct, so only the start and end may need trimming.\n",
+    "\n",
+    "    Example #1:\n",
+    "        first line: \"1. A man is going up hill at 1m/s\"\n",
+    "        last line:  \"1. A man is going up hill at 1m/s\"\n",
+    "\n",
+    "        output:\n",
+    "        {{\"start\": \"A man\", \"end\": \"1m/s\"}}\n",
     "    \"\"\"\n",
     "\n",
     "llm_task_trim_part = f\"\"\"\n",
-    "    You will be giving the text of a question's sub-question, extracted from a markdown file by specifying line numbers to extract from.\n",
-    "    This means that the first and last lines may contain unwanted text, such as:\n",
-    "        -   Question numbering (e.g. \"1.\", \"2.\", \"(a)\", \"(b)\", \"i.\", \"ii.\" ... etc.)\n",
-    "        -   Text from the previous or next question/parts/solution.\n",
-    "    Your task is to identify and remove any unwanted text from the start and end of the part content.\n",
-    "    You will only ever remove text from the start and end of the part content.\n",
-    "    Provide the position of the first character on the first line that is wanted,\n",
-    "    and the position of the last character on the last line that is wanted.\n",
+    "    You will be given the full text of a question, extracted from a markdown file by line numbers.\n",
+    "    The first and last lines may contain unwanted text, such as:\n",
+    "        - Question numbering (e.g. \"1.\", \"2.\", \"(a)\", \"(b)\", \"i.\", \"ii.\", etc.)\n",
+    "        - Text from the previous or next question.\n",
+    "\n",
+    "    Focus only on one sub-question (part) of the question, specified later.\n",
+    "\n",
+    "    Your task is to, using the full question as guidance:\n",
+    "        - From the first line, identify the exact substring where the sub-question begins, and put it in `start`.\n",
+    "        - From the last line, identify the exact substring where the sub-question ends, and put it in `end`.\n",
+    "        - Ensure that the substrings are taken verbatim from the original text, so they can be located precisely in Python code.\n",
+    "\n",
+    "    We assume that the middle of the sub-question is always correct, so only the start and end may need trimming.\n",
+    "\n",
+    "    Example #1:\n",
+    "        first line: \"answer the following question: (a) what is his speed?\"\n",
+    "        last line:  \"answer the following question: (a) what is his speed?\"\n",
+    "\n",
+    "        output:\n",
+    "        {{\"start\": \"what\", \"end\": \"speed?\"}}\n",
     "    \"\"\"\n",
     "\n",
     "llm_task_trim_part_solution = f\"\"\"\n",
-    "    You will be giving the text of a question's sub-question's solution, extracted from a markdown file by specifying line numbers to extract from.\n",
-    "    This means that the first and last lines may contain unwanted text, such as:\n",
-    "        -   Question numbering (e.g. \"1.\", \"2.\", \"(a)\", \"(b)\", \"i.\", \"ii.\" ... etc.)\n",
-    "        -   Text from the previous or next question/parts/solution.\n",
-    "    Your task is to identify and remove any unwanted text from the start and end of the solution content.\n",
-    "    You will only ever remove text from the start and end of the solution content.\n",
-    "    Provide the position of the first character on the first line that is wanted,\n",
-    "    and the position of the last character on the last line that is wanted.\n",
+    "    You will be given the full text of a question, extracted from a markdown file by line numbers.\n",
+    "    The first and last lines may contain unwanted text, such as:\n",
+    "        - Question numbering (e.g. \"1.\", \"2.\", \"(a)\", \"(b)\", \"i.\", \"ii.\", etc.)\n",
+    "        - Text from the previous or next question.\n",
+    "\n",
+    "    Focus only on one part-solution of a sub-question of the question, specified later.\n",
+    "\n",
+    "    Your task is to, using the full question as guidance:\n",
+    "        - From the first line, identify the exact substring where the part-solution begins, and put it in `start`.\n",
+    "        - From the last line, identify the exact substring where the part-solution ends, and put it in `end`.\n",
+    "        - Ensure that the substrings are taken verbatim from the original text, so they can be located precisely in Python code.\n",
+    "\n",
+    "    We assume that the middle of the part-solution is always correct, so only the start and end may need trimming.\n",
+    "\n",
+    "    Example #1:\n",
+    "        first line: \"A: (a) 2 + 3 = 5\"\n",
+    "        last line:  \"A: (a) 2 + 3 = 5\"\n",
+    "\n",
+    "        output:\n",
+    "        {{\"start\": \"A:\", \"end\": \"= 5\"}}\n",
     "    \"\"\"\n",
     "\n",
     "\n",
     "def trim_question(question: Set_Question_With_Solution) -> Set_Question_With_Solution:\n",
     "\n",
     "    def trim_question_content(content_text: str) -> str:\n",
+    "        if content_text == \"\":\n",
+    "            return content_text\n",
     "\n",
-    "        content_text = content_text.split(\"\\n\")\n",
+    "        # split_content = content_text.split(\"\\n\")\n",
     "        content_parser = PydanticOutputParser(pydantic_object=TrimPosition)\n",
     "\n",
     "        trim_content_prompt = f\"\"\"\n",
@@ -1219,11 +1256,10 @@
     "            Full question:\n",
     "            {question}\n",
     "\n",
-    "            First Line of the content:\n",
-    "            {enumerate(content_text[0])}\n",
+    "            Stem (content) of the question:\n",
+    "            {content_text}\n",
     "\n",
-    "            Last Line of the content:\n",
-    "            {enumerate(content_text[-1])}\n",
+    "            Return the JSON now.\n",
     "            \"\"\"\n",
     "        \n",
     "        for attempt_idx in range(3):\n",
@@ -1232,32 +1268,34 @@
     "\n",
     "            try:\n",
     "                parsed_output = content_parser.parse(response.content)\n",
-    "                start = parsed_output.start\n",
-    "                end = parsed_output.end\n",
-    "                print(start, end)\n",
-    "                print(content_text[0])\n",
-    "                print(content_text[-1])\n",
+    "                start = content_text[parsed_output.start]\n",
+    "                end = content_text[parsed_output.end]\n",
     "\n",
-    "                if start < 0 or end >= len(content_text) or start > end:\n",
+    "                if start < 0 or start >= len(split_content[0]) or end < 0 or end >= len(split_content[-1]):\n",
     "                    raise Exception(\"Invalid trim positions.\")\n",
     "                \n",
-    "                content_text[0] = content_text[0][start:]\n",
-    "                content_text[-1] = content_text[-1][:end + 1]\n",
+    "                split_content[0] = split_content[0][start:]\n",
+    "                split_content[-1] = split_content[-1][:end + 1]\n",
     "                print(\"LLM response successfully parsed trim positions.\")\n",
     "\n",
-    "                return \"\\n\".join(content_text).strip()\n",
+    "                return \"\\n\".join(split_content).strip()\n",
     "            except Exception as e:\n",
     "                print(f\"Error parsing LLM response as JSON for trimming content:\")\n",
     "                print(f\"Retrying... Attempt No.{attempt_idx + 1}\")\n",
     "                time.sleep(2)\n",
     "        else:\n",
     "            print(\"Final LLM Response:\")\n",
     "            print(response.content)\n",
+    "            print(\"Full content:\" , content_text)\n",
+    "            print(\"length of first line:\", len(split_content[0]))\n",
+    "            print(\"length of last line:\", len(split_content[-1]))\n",
     "            raise Exception(\"Failed to parse LLM response as JSON after multiple attempts for trimming content.\")\n",
     "\n",
     "    def trim_question_part(part_text: str) -> str:\n",
+    "        if part_text == \"\":\n",
+    "            return part_text\n",
     "        \n",
-    "        part_text = part_text.split(\"\\n\")\n",
+    "        split_part = part_text.split(\"\\n\")\n",
     "        part_parser = PydanticOutputParser(pydantic_object=TrimPosition)\n",
     "\n",
     "        trim_part_prompt = f\"\"\"\n",
@@ -1270,10 +1308,12 @@
     "            {question}\n",
     "\n",
     "            First Line of the part:\n",
-    "            {enumerate(part_text[0])}\n",
+    "            {split_part[0]}\n",
     "\n",
     "            Last Line of the part:\n",
-    "            {enumerate(part_text[-1])}\n",
+    "            {split_part[-1]}\n",
+    "\n",
+    "            Return the JSON now.\n",
     "            \"\"\"\n",
     "        \n",
     "        for attempt_idx in range(3):\n",
@@ -1285,26 +1325,31 @@
     "                start = parsed_output.start\n",
     "                end = parsed_output.end\n",
     "\n",
-    "                if start < 0 or end >= len(part_text) or start > end:\n",
+    "                if start < 0 or start >= len(split_part[0]) or end < 0 or end >= len(split_part[-1]):\n",
     "                    raise Exception(\"Invalid trim positions.\")\n",
     "                \n",
-    "                part_text[0] = part_text[0][start:]\n",
-    "                part_text[-1] = part_text[-1][:end + 1]\n",
+    "                split_part[0] = split_part[0][start:]\n",
+    "                split_part[-1] = split_part[-1][:end + 1]\n",
     "                print(\"LLM response successfully parsed trim positions.\")\n",
     "\n",
-    "                return \"\\n\".join(part_text).strip()\n",
+    "                return \"\\n\".join(split_part).strip()\n",
     "            except Exception as e:\n",
     "                print(f\"Error parsing LLM response as JSON for trimming part:\")\n",
     "                print(f\"Retrying... Attempt No.{attempt_idx + 1}\")\n",
     "                time.sleep(2)\n",
     "        else:\n",
     "            print(\"Final LLM Response:\")\n",
     "            print(response.content)\n",
+    "            print(\"Full part:\" , part_text)\n",
+    "            print(\"length of first line:\", len(split_part[0]))\n",
+    "            print(\"length of last line:\", len(split_part[-1]))\n",
     "            raise Exception(\"Failed to parse LLM response as JSON after multiple attempts for trimming part.\")\n",
     "\n",
     "    def trim_question_part_solution(solution_text: str) -> str:\n",
+    "        if solution_text == \"\":\n",
+    "            return solution_text\n",
     "        \n",
-    "        solution_text = solution_text.split(\"\\n\")\n",
+    "        split_solution = solution_text.split(\"\\n\")\n",
     "        solution_parser = PydanticOutputParser(pydantic_object=TrimPosition)\n",
     "\n",
     "        trim_solution_prompt = f\"\"\"\n",
@@ -1317,10 +1362,12 @@
     "            {question}\n",
     "\n",
     "            First Line of the solution part:\n",
-    "            {enumerate(solution_text[0])}\n",
+    "            {split_solution[0]}\n",
     "\n",
     "            Last Line of the solution part:\n",
-    "            {enumerate(solution_text[-1])}\n",
+    "            {split_solution[-1]}\n",
+    "\n",
+    "            Return the JSON now.\n",
     "            \"\"\"\n",
     "        \n",
     "        for attempt_idx in range(3):\n",
@@ -1332,14 +1379,14 @@
     "                start = parsed_output.start\n",
     "                end = parsed_output.end\n",
     "\n",
-    "                if start < 0 or end >= len(solution_text) or start > end:\n",
+    "                if start < 0 or start >= len(split_solution[0]) or end < 0 or end >= len(split_solution[-1]):\n",
     "                    raise Exception(\"Invalid trim positions.\")\n",
     "                \n",
-    "                solution_text[0] = solution_text[0][start:]\n",
-    "                solution_text[-1] = solution_text[-1][:end + 1]\n",
+    "                split_solution[0] = split_solution[0][start:]\n",
+    "                split_solution[-1] = split_solution[-1][:end + 1]\n",
     "                print(\"LLM response successfully parsed trim positions.\")\n",
     "\n",
-    "                return \"\\n\".join(solution_text).strip()\n",
+    "                return \"\\n\".join(split_solution).strip()\n",
     "            except Exception as e:\n",
     "                print(f\"Error parsing LLM response as JSON for trimming solution part:\")\n",
     "                print(f\"Retrying... Attempt No.{attempt_idx + 1}\")\n",
@@ -1348,6 +1395,9 @@
     "        else:\n",
     "            print(\"Final LLM Response:\")\n",
     "            print(response.content)\n",
+    "            print(\"Full solution part:\" , solution_text)\n",
+    "            print(\"length of first line:\", len(split_solution[0]))\n",
+    "            print(\"length of last line:\", len(split_solution[-1]))\n",
     "            raise Exception(\"Failed to parse LLM response as JSON after multiple attempts for trimming solution part.\")\n",
     "\n",
     "    question.content = trim_question_content(question.content)\n",
diff --git a/conversion2025/testing_and_prototype.ipynb b/conversion2025/testing_and_prototype.ipynb
@@ -138,15 +138,12 @@
     "llm_mini = ChatOpenAI(\n",
     "            model=\"gpt-5-mini\",\n",
     "            api_key=os.environ[\"OPENAI_API_KEY\"],\n",
-    "            reasoning_effort=\"high\"\n",
+    "            reasoning_effort=\"low\",\n",
+    "            cache=True,\n",
     "        )\n",
-    "prompt = \"how many letters are in this prompt, only return the number.\"\n",
-    "prompt = \"return and only return the prompt exactly\"\n",
-    "prompt = \"waeuifgiufaiu liaisofeoidob ofbea df kdb vboae beoihffewafne nod In this prompt, where does the first p occur, using 0 indexing? only return the answer\"\n",
-    "\n",
+    "prompt = \"what is 5+4?\"\n",
     "response = llm_mini.invoke(prompt).content\n",
     "\n",
-    "print(list(prompt).index(\"p\"))\n",
     "print(response)\n",
     "# print(len(response) == int(response))"
    ]