Implemented correct smart trimming (trimmed text no longer cuts through inline/display math spans)

HarrySu123 · HarrySu123 · commit 06cd767722d8 · 2025-09-09T15:40:09.000+01:00
diff --git a/conversion2025/mathpix_to_llm_with_lines_to_api.ipynb b/conversion2025/mathpix_to_llm_with_lines_to_api.ipynb
@@ -400,6 +400,7 @@
     "class InlineMath(Markdown):\n",
     "    def __init__(self, content):\n",
     "        super().__init__(content)\n",
+    "        self.delimiter_size = 2\n",
     "\n",
     "    def __str__(self):\n",
     "        return f\"InlineMath({self.content!r})\"\n",
@@ -411,6 +412,7 @@
     "class DisplayMath(Markdown):    \n",
     "    def __init__(self, content):\n",
     "        super().__init__(content)\n",
+    "        self.delimiter_size = 4\n",
     "\n",
     "    def __str__(self):\n",
     "        return f\"DisplayMath({self.content!r})\"\n",
@@ -1231,25 +1233,49 @@
     "# but overall the position should be fairly accurate.\n",
     "\n",
     "def improve_trim(text: str, start: int, end: int) -> str:\n",
-    "    markdown_classes = convert_markdown_to_classes_by_lines(text)\n",
-    "    index = 0\n",
+    "    \"\"\"Trim text to [start, end) without cutting through a math span.\n",
+    "\n",
+    "    Plain-text runs are sliced at the exact offsets; an InlineMath or\n",
+    "    DisplayMath that contains either offset is kept whole.\n",
+    "    \"\"\"\n",
+    "    markdown_classes = convert_markdown_to_classes(text)\n",
+    "    text_index = 0\n",
+    "    class_index = 0\n",
+    "    improved_start = -1\n",
+    "    improved_end   = -1\n",
     "\n",
-    "    for i in range(len(markdown_classes)):\n",
-    "        structure = markdown_classes[i]\n",
+    "    while class_index < len(markdown_classes):\n",
+    "        structure = markdown_classes[class_index]\n",
     "\n",
     "        match structure:\n",
     "            case RegularText():\n",
-    "                if len(structure.content) + index < start:\n",
-    "                    # start is not in this structure\n",
-    "                    index += len(structure.content) + 1\n",
-    "                    continue\n",
-    "                else:\n",
-    "                    # start is in this structure\n",
-    "                    structure_length = len(structure.content)\n",
-    "                    structure.content = structure.content[start - index:]\n",
-    "                    index += structure_length + 1\n",
-    "                    continue\n",
-    "    return \"\"\n",
+    "                structure_length = len(structure.content)\n",
+    "                temp_improved_start = 0\n",
+    "                temp_improved_end = len(structure.content)\n",
+    "                if text_index <= start and structure_length + text_index > start:\n",
+    "                    improved_start = class_index\n",
+    "                    temp_improved_start = start - text_index\n",
+    "                if text_index <= end and structure_length + text_index >= end:\n",
+    "                    improved_end = class_index\n",
+    "                    temp_improved_end = end - text_index\n",
+    "                structure.content = structure.content[temp_improved_start:temp_improved_end]\n",
+    "                text_index += structure_length\n",
+    "\n",
+    "            case InlineMath() | DisplayMath():\n",
+    "                structure_length = len(structure.content) + structure.delimiter_size\n",
+    "                if text_index <= start and structure_length + text_index > start:\n",
+    "                    improved_start = class_index\n",
+    "                # Strict '<': math that starts exactly at `end` is outside [start, end).\n",
+    "                if text_index < end and structure_length + text_index >= end:\n",
+    "                    improved_end = class_index\n",
+    "                text_index += structure_length\n",
+    "        class_index += 1\n",
+    "\n",
+    "    # An offset matched no parsed structure: fall back to the raw slice.\n",
+    "    if improved_start == -1 or improved_end == -1:\n",
+    "        return text[start:end]\n",
+    "    ret = markdown_classes[improved_start:improved_end + 1]\n",
+    "    return convert_classes_to_markdown(ret)\n",
     "\n"
    ]
   },
@@ -1337,9 +1363,12 @@
     "    \"\"\"\n",
     "\n",
     "\n",
-    "def trim_question(question: Set_Question_With_Solution) -> Set_Question_With_Solution:\n",
+    "def trim_question(question: tuple[int, Set_Question_With_Solution]) -> Set_Question_With_Solution:\n",
+    "    question_number, question = question\n",
+    "    question_number += 1\n",
     "\n",
     "    def trim_question_content(content_text: str) -> str:\n",
+    "\n",
     "        if content_text == \"\":\n",
     "            return content_text\n",
     "\n",
@@ -1368,20 +1397,22 @@
     "            try:\n",
     "                parsed_output = content_parser.parse(response.content)\n",
     "                start = content_text.index(parsed_output.start)\n",
-    "                end   = content_text.index(parsed_output.end)\n",
-    "                print(\"Successfully trimmed the stem.\")\n",
+    "                end   = content_text.index(parsed_output.end) + len(parsed_output.end)\n",
+    "                print(f\"Successfully trimmed the stem of question {question_number}.\")\n",
     "\n",
-    "                return content_text[start:end + len(parsed_output.end) + 1].strip()\n",
+    "                return improve_trim(content_text, start, end)\n",
     "            except Exception as e:\n",
-    "                print(f\"Error parsing LLM response as JSON for trimming content:\")\n",
+    "                print(f\"Error parsing LLM response as JSON for trimming content of question {question_number}:\")\n",
     "                print(f\"Retrying... Attempt No.{attempt_idx + 1}\")\n",
     "                time.sleep(2)\n",
     "        else:\n",
     "            print(\"Final LLM Response:\")\n",
     "            print(response.content)\n",
     "            raise Exception(\"Failed to parse LLM response as JSON after multiple attempts for trimming content.\")\n",
     "\n",
-    "    def trim_question_part(part_text: str) -> str:\n",
+    "    def trim_question_part(part: tuple[int, str]) -> str:\n",
+    "        part_number, part_text = part\n",
+    "        part_number += 1\n",
     "        if part_text == \"\":\n",
     "            return part_text\n",
     "        \n",
@@ -1410,20 +1441,22 @@
     "            try:\n",
     "                parsed_output = part_parser.parse(response.content)\n",
     "                start = part_text.index(parsed_output.start)\n",
-    "                end   = part_text.index(parsed_output.end)\n",
-    "                print(\"Successfully trimmed part\")\n",
+    "                end   = part_text.index(parsed_output.end) + len(parsed_output.end)\n",
+    "                print(f\"Successfully trimmed part of question {question_number}, part {part_number}.\")\n",
     "\n",
-    "                return part_text[start:end + len(parsed_output.end) + 1].strip()\n",
+    "                return improve_trim(part_text, start, end)\n",
     "            except Exception as e:\n",
-    "                print(f\"Error parsing LLM response as JSON for trimming part:\")\n",
+    "                print(f\"Error parsing LLM response as JSON for trimming part for question {question_number}, part {part_number}\")\n",
     "                print(f\"Retrying... Attempt No.{attempt_idx + 1}\")\n",
     "                time.sleep(2)\n",
     "        else:\n",
     "            print(\"Final LLM Response:\")\n",
     "            print(response.content)\n",
     "            raise Exception(\"Failed to parse LLM response as JSON after multiple attempts for trimming part.\")\n",
     "\n",
-    "    def trim_question_part_solution(solution_text: str) -> str:\n",
+    "    def trim_question_part_solution(solution: tuple[int, str]) -> str:\n",
+    "        part_number, solution_text = solution\n",
+    "        part_number += 1\n",
     "        if solution_text == \"\":\n",
     "            return solution_text\n",
     "        \n",
@@ -1452,12 +1485,12 @@
     "            try:\n",
     "                parsed_output = solution_parser.parse(response.content)\n",
     "                start = solution_text.index(parsed_output.start)\n",
-    "                end   = solution_text.index(parsed_output.end)\n",
-    "                print(\"Successfully trimmed part-solution.\")\n",
+    "                end   = solution_text.index(parsed_output.end) + len(parsed_output.end)\n",
+    "                print(f\"Successfully trimmed part-solution for question {question_number}, part {part_number}.\")\n",
     "\n",
-    "                return solution_text[start:end + len(parsed_output.end) + 1].strip()\n",
+    "                return improve_trim(solution_text, start, end)\n",
     "            except Exception as e:\n",
-    "                print(f\"Error parsing LLM response as JSON for trimming solution part:\")\n",
+    "                print(f\"Error parsing LLM response as JSON for trimming solution part for question {question_number}, part {part_number}\")\n",
     "                print(f\"Retrying... Attempt No.{attempt_idx + 1}\")\n",
     "                time.sleep(2)\n",
     "\n",
@@ -1469,15 +1502,15 @@
     "    question.content = trim_question_content(question.content)\n",
     "\n",
     "    with concurrent.futures.ThreadPoolExecutor() as executor:\n",
-    "        question.parts = list(executor.map(trim_question_part, question.parts))\n",
-    "        question.parts_solutions = list(executor.map(trim_question_part_solution, question.parts_solutions))\n",
+    "        question.parts = list(executor.map(trim_question_part, enumerate(question.parts)))\n",
+    "        question.parts_solutions = list(executor.map(trim_question_part_solution, enumerate(question.parts_solutions)))\n",
     "\n",
     "    return question\n",
     "\n",
     "def trim_text(set_questions: Set_Lines) -> Set_Lines:\n",
     "\n",
     "    with concurrent.futures.ThreadPoolExecutor() as executor:\n",
-    "        set_questions.questions = list(executor.map(trim_question, set_questions.questions))\n",
+    "        set_questions.questions = list(executor.map(trim_question, enumerate(set_questions.questions)))\n",
     "\n",
     "    return set_questions\n"
    ]