Removed duplicated text when a question has a single part, and added an example for separating question parts

HarrySu123 · HarrySu123 · commit ccd738f43d90 · 2025-08-05T00:59:40.000+01:00
diff --git a/conversion2025/mathpix_to_llm_with_lines_to_api.ipynb b/conversion2025/mathpix_to_llm_with_lines_to_api.ipynb
@@ -404,7 +404,8 @@
     "    questions: list[QuestionModelLines] = Field(..., description=\"A list of questions.\")\n",
     "\n",
     "llm_task_seperate_questions = \"\"\"\n",
-    "    Your task is to extract the line numbers for the start and end of each question and solution from the markdown file, then format it as a JSON object.\n",
+    "    Your task is to extract the line numbers for the start and end of all the question and solution from the markdown file, then format it as a JSON object.\n",
+    "    Note that the questions and solutions may not be around the same area in the markdown file.\n",
     "    These line numbers will be used later to extract the content of the questions and solutions procedurally.\n",
     "    \n",
     "    1.  **Content Extraction:**\n",
@@ -413,7 +414,7 @@
     "        -   Begin by identifying all the questions in the markdown file, and for each question:\n",
     "            -   Identify the start and end line numbers of the full question content, and place them in `question_content_start` and `question_content_end`.\n",
     "            -   Identify the start and end line numbers of the full relevant solution content, and place them in `solution_content_start` and `solution_content_end`.\n",
-    "            -   Be careful to ensure that everything related to the question and solution is included, including any math delimiters and LaTeX formatting.\n",
+    "            -   Be careful to ensure that everything related to the question and solution is included, including any math delimiters($, $$) and LaTeX formatting.\n",
     "            -   Do not forget to include any images or figures that are part of the question or solution.\n",
     "    \n",
     "    2.  **Output Format:**\n",
@@ -568,7 +569,7 @@
     "        \"\"\"\n",
     "        Initialize the Set_Question_With_Solution with a question and its solution.\n",
     "        \n",
-    "        Args:\n",
+    "        Args: \n",
     "            question (Set_Question): The question object.\n",
     "            solution (Set_Solution): The solution object.\n",
     "        \"\"\"\n",
@@ -625,12 +626,12 @@
     "llm_task_seperate_parts_question = r\"\"\"\n",
     "    1. **Content Extraction:**\n",
     "        -   You may choose the `title` for the question.\n",
-    "        -   From the input `Full Question Content`, identify the start line and end line for the main introductory text (the stem), place them in `content_start` and `content_end`. \n",
+    "        -   From the input `Full Question Content`, identify the start line and end line for the main introductory text (the stem), place them in `content_start` and `content_end`.\n",
     "        -   From the input `Full Question Content`, identify and separate all the `parts`(sub-questions), they could be explicit (e.g. using, \"(a)\", \"(b)\", \"i.\", \"ii.\"... etc.), but may also be implied. For each identified sub-question:\n",
     "            -   Place the start line going into `part_start` and the end line going into `part_end`.\n",
     "            -   If the question has no sub-questions, leave `part_start` as 0 and `part_end` as -1.\n",
     "            -   You may use the `Full Solution Content` to help with identifying the parts.\n",
-    "        -   Be careful to ensure that everything related to the question stem/parts is included, including any math delimiters and LaTeX formatting.\n",
+    "        -   Be careful to ensure that everything related to the question stem/parts is included, including any math delimiters($, $$) and LaTeX formatting.\n",
     "        -   Do not forget to include any images or figures that are part of the question stem, parts or solution.\n",
     "        -   Ensure no solution content is included in the `content` or `parts` fields.\n",
     "    \n",
@@ -639,13 +640,37 @@
     "        -   Do NOT include any explanations, comments, or markdown code blocks (like ```json).\n",
     "    \"\"\"\n",
     "\n",
+    "example_seperate_parts_question = r\"\"\"\n",
+    "    example:\n",
+    "    [(0, \"Q1. find value of $x$ in the following equation:\"),\n",
+    "    (1, \"i. $x + 1 = 2$\"),\n",
+    "    (2, \"ii. $x - 1 = 5$\")]\n",
+    "\n",
+    "    should be converted to:\n",
+    "    {\n",
+    "        \"title\": \"suitable title\",\n",
+    "        \"content_start\": 0,\n",
+    "        \"content_end\": 0,\n",
+    "        \"parts\": [\n",
+    "            {\n",
+    "                \"part_start\": 1,\n",
+    "                \"part_end\": 1\n",
+    "            },\n",
+    "            {\n",
+    "                \"part_start\": 2,\n",
+    "                \"part_end\": 2\n",
+    "            }\n",
+    "        ]\n",
+    "    }\n",
+    "    \"\"\"\n",
+    "\n",
     "llm_task_seperate_parts_solution = r\"\"\"\n",
     "    1. **Content Extraction:**\n",
     "        -   From the input `full solution content`, identify the specific solution part that corresponds to the `target question part`, and place the start line and end line into `part_solution_start` and `part_solution_end`.\n",
     "        -   If the `target question part` is empty, identify the specific solution part that corresponds to the `full question stem`.\n",
     "        -   Use the `full question stem` and `full question parts` to help identify the specific solution part.\n",
     "        -   Ensure that the `target question part` is used to extract the specific solution part.\n",
-    "        -   Be careful to ensure that everything related to the solution part is included, including any math delimiters and LaTeX formatting.\n",
+    "        -   Be careful to ensure that everything related to the solution part is included, including any math delimiters($, $$) and LaTeX formatting.\n",
     "        -   Do not forget to include any images or figures that are part of the solution.\n",
     "\n",
     "    2.  **Output Format:**\n",
@@ -672,6 +697,8 @@
     "\n",
     "        {llm_task_seperate_parts_question}\n",
     "\n",
+    "        {example_seperate_parts_question}\n",
+    "\n",
     "        Full Solution Content:\n",
     "        {solution_input}\n",
     "\n",
@@ -782,10 +809,57 @@
     "    ).model_dump()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "24",
+   "metadata": {},
+   "source": [
+    "# Remove the duplicated text for single-part questions"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "24",
+   "id": "25",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# class NoPartsQuestionModel(BaseModel):\n",
+    "#     \"\"\"\n",
+    "#     Represents a question without parts.\n",
+    "#     \"\"\"\n",
+    "#     hasParts: bool = Field(False, description=\"Indicates if the question has parts.\")\n",
+    "\n",
+    "# llm_task_remove_dupe = \"\"\"\n",
+    "#     1.  **Task:**\n",
+    "#         -   Check if the single part that the question has is the same as the full question content.\n",
+    "#         -   If it is not, then remove the part and set `hasParts` to `False`.\n",
+    "#         -   If it is, then set `hasParts` to `True`.\n",
+    "        \n",
+    "#     2.  **Output Format:**\n",
+    "#         -   You MUST output ONLY a single, raw, valid JSON string that matches the provided schema.\n",
+    "#         -   Do NOT include any explanations, comments, or markdown code blocks (like ```json).\n",
+    "#     \"\"\"\n",
+    "# def llm_remove_dupe_part(content: str, part: str) -> bool:\n",
+    "#     return content == part\n",
+    "\n",
+    "def dupe_text_reduce(questions_dict: dict) -> dict:\n",
+    "    \"\"\"\n",
+    "    Reduces duplicate text in the questions content and its parts.\n",
+    "    \"\"\"\n",
+    "    for question in questions_dict[\"questions\"]:\n",
+    "        parts = question[\"parts\"]\n",
+    "        if len(parts) == 1 and parts[0] == question[\"content\"]:\n",
+    "            # If the only part is the same as the content, blank the part out so the text is not duplicated.\n",
+    "            question[\"parts\"][0] = \"\"\n",
+    "    \n",
+    "    return questions_dict"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "26",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -815,16 +889,16 @@
     "\n",
     "    extracted_dict = extract_parts_question(questions_dict)\n",
     "    print(\"succesfully extracted the parts from the questions.\")\n",
-    "    print(json.dumps(extracted_dict, indent=2))\n",
+    "    print(json.dumps(extracted_dict))\n",
     "    print(\"Now validating the content...\")\n",
     "\n",
-    "    return extracted_dict"
+    "    return dupe_text_reduce(extracted_dict)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "25",
+   "id": "27",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -833,7 +907,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "26",
+   "id": "28",
    "metadata": {},
    "source": [
     "# Displaying questions"
@@ -842,7 +916,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "27",
+   "id": "29",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -869,7 +943,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "28",
+   "id": "30",
    "metadata": {},
    "source": [
     "# in2lambda to JSON"
@@ -878,7 +952,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "29",
+   "id": "31",
    "metadata": {},
    "outputs": [],
    "source": [