Skip to content

Commit be7d458

Browse files
committed
Converter now enumerates over custom Python classes to avoid missing display math delimiters
1 parent 3c8cb64 commit be7d458

File tree

1 file changed

+14
-14
lines changed

1 file changed

+14
-14
lines changed

conversion2025/mathpix_to_llm_with_lines_to_api.ipynb

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -541,7 +541,7 @@
541541
" \"\"\"\n",
542542
"\n",
543543
"# Prompt for the LLM to extract questions.\n",
544-
"def seperate_questions_prompt(parser: PydanticOutputParser[AllQuestionsModelLines], doc_page_content: list[str]) -> str: #, previous_repsonse: str = \"\", improvements: list[str] = \"\") -> str:\n",
544+
"def seperate_questions_prompt(parser: PydanticOutputParser[AllQuestionsModelLines], doc_page_content: list[Markdown]) -> str: #, previous_repsonse: str = \"\", improvements: list[str] = \"\") -> str:\n",
545545
"\n",
546546
" feedback = \"\"\n",
547547
" # if previous_repsonse:\n",
@@ -624,7 +624,7 @@
624624
" questions: list[QuestionModel] = Field(..., description=\"A list of questions.\")\n",
625625
"\n",
626626
"\n",
627-
"def extract_questions(allQuestionsModel: AllQuestionsModelLines, doc_page_content: list[str]) -> AllQuestionsModel:\n",
627+
"def extract_questions(allQuestionsModel: AllQuestionsModelLines, doc_page_content: list[Markdown]) -> AllQuestionsModel:\n",
628628
" \"\"\"\n",
629629
" Extracts questions from the AllQuestions model and returns a list of Question objects.\n",
630630
" \"\"\"\n",
@@ -635,8 +635,8 @@
635635
" questions = []\n",
636636
"\n",
637637
" for question in allQuestionsModel.questions:\n",
638-
" question_content = \"\\n\".join(doc_page_content[question.question_content_start:question.question_content_end+1])\n",
639-
" solution_content = \"\\n\".join(doc_page_content[question.solution_content_start:question.solution_content_end+1])\n",
638+
" question_content = classes_to_markdown(doc_page_content[question.question_content_start:question.question_content_end+1])\n",
639+
" solution_content = classes_to_markdown(doc_page_content[question.solution_content_start:question.solution_content_end+1])\n",
640640
" #important, image will be wrong if two identical images are used, although this should not be possible.\n",
641641
" images = list(set(extract_images(question_content) + extract_images(solution_content)))\n",
642642
"\n",
@@ -774,7 +774,7 @@
774774
"metadata": {},
775775
"outputs": [],
776776
"source": [
777-
"def llm_extract_questions_lines(markdown: list[str]) -> dict:\n",
777+
"def llm_extract_questions_lines(markdown: list[Markdown]) -> dict:\n",
778778
" print(\"Begining to seperate the questions from the markdown content...\")\n",
779779
" \n",
780780
" # Initialise the parser for the output.\n",
@@ -879,24 +879,24 @@
879879
"metadata": {},
880880
"outputs": [],
881881
"source": [
882-
"def convert_set_question_lines_to_set_question(set_question_lines: Set_Question_Lines, question_content: list[str], images: list[str] = []) -> Set_Question:\n",
882+
"def convert_set_question_lines_to_set_question(set_question_lines: Set_Question_Lines, question_content: list[Markdown], images: list[str] = []) -> Set_Question:\n",
883883
" \"\"\"\n",
884884
" Convert Set_Question_Lines to Set_Question.\n",
885885
" \"\"\"\n",
886886
" return Set_Question(\n",
887887
" title=set_question_lines.title,\n",
888-
" content=\"\\n\".join(question_content[set_question_lines.content_start:set_question_lines.content_end + 1]),\n",
889-
" parts=[\"\\n\".join(question_content[part.part_start:part.part_end + 1]) for part in set_question_lines.parts],\n",
888+
" content=classes_to_markdown(question_content[set_question_lines.content_start:set_question_lines.content_end + 1]),\n",
889+
" parts=[classes_to_markdown(question_content[part.part_start:part.part_end + 1]) for part in set_question_lines.parts],\n",
890890
" images=images\n",
891891
" )\n",
892892
"\n",
893-
"def convert_set_solution_lines_to_set_solution(set_solution_lines: list[Set_Solution_Part_Lines], solution_content: list[str]) -> Set_Solution:\n",
893+
"def convert_set_solution_lines_to_set_solution(set_solution_lines: list[Set_Solution_Part_Lines], solution_content: list[Markdown]) -> Set_Solution:\n",
894894
" \"\"\"\n",
895895
" Convert Set_Solution_Part_Lines to Set_Solution.\n",
896896
" \"\"\"\n",
897897
" return Set_Solution(\n",
898898
" parts_solutions=[\n",
899-
" \"\\n\".join(solution_content[part.part_solution_start:part.part_solution_end + 1])\n",
899+
" classes_to_markdown(solution_content[part.part_solution_start:part.part_solution_end + 1])\n",
900900
" for part in set_solution_lines\n",
901901
" ]\n",
902902
" )\n"
@@ -972,7 +972,7 @@
972972
" # Initialize the output parser with the Set_Question schema.\n",
973973
" question_parser = PydanticOutputParser(pydantic_object=Set_Question_Lines)\n",
974974
"\n",
975-
" question_input: list[str] = question[\"question_content\"].splitlines()\n",
975+
" question_input: list[Markdown] = markdown_to_classes(question[\"question_content\"])\n",
976976
" solution_input: str = question[\"solution_content\"]\n",
977977
" all_images = question[\"images\"]\n",
978978
"\n",
@@ -1023,7 +1023,7 @@
10231023
" part_idx, part = part_data\n",
10241024
" solution_parser = PydanticOutputParser(pydantic_object=Set_Solution_Part_Lines)\n",
10251025
"\n",
1026-
" target_solution_input: list[str] = solution_input.splitlines()\n",
1026+
" target_solution_input: list[Markdown] = markdown_to_classes(solution_input)\n",
10271027
"\n",
10281028
" # Prompt for the LLM to extract The solution part.\n",
10291029
" # Use the full solution content and the part to extract the specific solution.\n",
@@ -1071,7 +1071,7 @@
10711071
"\n",
10721072
" solutions_parts = convert_set_solution_lines_to_set_solution(\n",
10731073
" solutions_parts, \n",
1074-
" solution_input.splitlines()\n",
1074+
" markdown_to_classes(solution_input)\n",
10751075
" )\n",
10761076
"\n",
10771077
" # set_solution = Set_Solution(parts_solutions=solutions_parts)\n",
@@ -1166,7 +1166,7 @@
11661166
" If parsing fails, returns None.\n",
11671167
" \"\"\"\n",
11681168
"\n",
1169-
" md_content_lines = md_content.splitlines()\n",
1169+
" md_content_lines = markdown_to_classes(md_content)\n",
11701170
"\n",
11711171
" # corrected_md_content = correct_mistakes_in_markdown(md_content)\n",
11721172
" # print(\"Markdown content corrected for spelling, grammar, and structure.\")\n",

0 commit comments

Comments
 (0)