Skip to content

Commit 3c8cb64

Browse files
committed
added a parser for the markdown
1 parent 76cbea1 commit 3c8cb64

File tree

6 files changed

+502
-195
lines changed

6 files changed

+502
-195
lines changed

conversion2025/assumptions.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ assumptions:
44
- parts are only 1 level deep (i.e. no Q1, part a), i)
55
- individual questions and solutions are separable by using just lines
66
- all parts are explicitly enumerated
7+
- Chunky Independent Maths are separated (otherwise Mathpix will not be able to separate them)
78

89

910
parts need to be ordered

conversion2025/line_parser.lua

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
-- custom parser to parse input line by line except display math in a markdown file
2+
local pandoc = require("pandoc")
3+
4+
--- Split text into lines, accepting both LF and CRLF line endings.
-- @tparam string text raw document text
-- @treturn table sequence of lines without their line terminators
local function split_lines(text)
  local lines = {}
  -- Appending "\n" guarantees that a final line without a terminator is kept.
  for line in (text .. "\n"):gmatch("(.-)\r?\n") do
    lines[#lines + 1] = line
  end
  return lines
end

--- Build a paragraph holding one display-math element.
-- @tparam table parts math lines with the "$$" delimiters already removed
-- @treturn table a pandoc Para containing a DisplayMath inline
local function display_math_block(parts)
  local text = table.concat(parts, "\n")
  -- Drop the whitespace left behind where the delimiter lines used to be.
  text = text:gsub("^%s+", ""):gsub("%s+$", "")
  return pandoc.Para{pandoc.Math("DisplayMath", text)}
end

--- Pandoc custom reader: one paragraph per non-empty line, except that
-- display math ("$$" ... "$$") is collected into a single DisplayMath element.
--
-- Behaviour:
--   * a line containing one "$$" opens a display-math block; the next line
--     containing "$$" closes it;
--   * a line containing two or more "$$" is a complete one-line block;
--   * the "$$" delimiters are NOT stored in the math text, since the Math
--     element carries the math source only and writers add delimiters;
--   * blank lines outside math are skipped;
--   * every other line becomes a Para with a single Str holding the raw line;
--   * math still open at end of input is emitted (with a warning on stderr)
--     instead of being silently discarded.
--
-- Nothing is written to stdout, because pandoc uses stdout for the converted
-- document.
--
-- @param input the sources handed over by pandoc; converted with tostring
-- @param reader_opts reader options supplied by pandoc (currently unused)
-- @return a pandoc.Pandoc document built from the collected blocks
function Reader(input, reader_opts)
  local lines = split_lines(tostring(input))
  local blocks = {}
  local math_buffer = nil -- non-nil only while inside a display-math block

  for _, line in ipairs(lines) do
    -- gsub returns the line without delimiters and how many were removed.
    local stripped, delimiters = line:gsub("%$%$", "")

    if math_buffer then
      math_buffer[#math_buffer + 1] = stripped
      if delimiters > 0 then
        -- closing delimiter: emit the collected math
        blocks[#blocks + 1] = display_math_block(math_buffer)
        math_buffer = nil
      end
    elseif delimiters >= 2 then
      -- opening and closing delimiter on the same line
      blocks[#blocks + 1] = display_math_block({stripped})
    elseif delimiters == 1 then
      -- opening delimiter: start collecting
      math_buffer = {stripped}
    elseif not line:match("^%s*$") then
      -- regular, non-empty line
      blocks[#blocks + 1] = pandoc.Para{pandoc.Str(line)}
    end
  end

  if math_buffer then
    -- Unterminated "$$": keep the content rather than dropping it.
    io.stderr:write("line_parser: unterminated display math at end of input\n")
    blocks[#blocks + 1] = display_math_block(math_buffer)
  end

  return pandoc.Pandoc(blocks)
end

conversion2025/mathpix_to_llm_with_lines_to_api.ipynb

Lines changed: 87 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -381,7 +381,7 @@
381381
"id": "16",
382382
"metadata": {},
383383
"source": [
384-
"# Extract Questions"
384+
"# Transform into markdown"
385385
]
386386
},
387387
{
@@ -390,6 +390,74 @@
390390
"id": "17",
391391
"metadata": {},
392392
"outputs": [],
393+
"source": [
394+
"# intermediate representation of the markdown\n",
395+
"class Markdown():\n",
396+
" def __init__(self, content):\n",
397+
" self.content = content\n",
398+
"\n",
399+
"class DisplayMath(Markdown):\n",
400+
" content = \"\"\n",
401+
" \n",
402+
" def __init__(self, content):\n",
403+
" super().__init__(content)\n",
404+
"\n",
405+
" def __str__(self):\n",
406+
" return f\"$$\\n{self.content}\\n$$\"\n",
407+
" \n",
408+
" def __repr__(self):\n",
409+
" return f\"DisplayMath({self.content!r})\"\n",
410+
"\n",
411+
"class RegularText(Markdown):\n",
412+
" def __init__(self, content):\n",
413+
" super().__init__(content)\n",
414+
"\n",
415+
" def __str__(self):\n",
416+
" return self.content\n",
417+
"\n",
418+
" def __repr__(self):\n",
419+
" return f\"RegularText({self.content!r})\"\n",
420+
"\n",
421+
"\n",
422+
"def markdown_to_classes(markdown: str) -> list[Markdown]:\n",
423+
" lines = markdown.split(\"\\n\")\n",
424+
" ret = []\n",
425+
" math_buffer = []\n",
426+
" displayMath = False\n",
427+
" for line in lines:\n",
428+
" if line == \"$$\":\n",
429+
" displayMath = not displayMath\n",
430+
" if not displayMath:\n",
431+
" ret.append(DisplayMath(\"\\n\".join(math_buffer)))\n",
432+
" math_buffer = []\n",
433+
" else:\n",
434+
" if displayMath:\n",
435+
" math_buffer.append(line)\n",
436+
" else:\n",
437+
" ret.append(RegularText(line))\n",
438+
" return ret\n",
439+
"\n",
440+
"def classes_to_markdown(classes: list[Markdown]) -> str:\n",
441+
" lines = []\n",
442+
" for c in classes:\n",
443+
" lines.append(str(c))\n",
444+
" return \"\\n\".join(lines)"
445+
]
446+
},
447+
{
448+
"cell_type": "markdown",
449+
"id": "18",
450+
"metadata": {},
451+
"source": [
452+
"# Extract Questions"
453+
]
454+
},
455+
{
456+
"cell_type": "code",
457+
"execution_count": null,
458+
"id": "19",
459+
"metadata": {},
460+
"outputs": [],
393461
"source": [
394462
"#define initial question model\n",
395463
"class QuestionModelLines(BaseModel):\n",
@@ -506,7 +574,7 @@
506574
},
507575
{
508576
"cell_type": "markdown",
509-
"id": "18",
577+
"id": "20",
510578
"metadata": {},
511579
"source": [
512580
"# extracting images from content"
@@ -515,7 +583,7 @@
515583
{
516584
"cell_type": "code",
517585
"execution_count": null,
518-
"id": "19",
586+
"id": "21",
519587
"metadata": {},
520588
"outputs": [],
521589
"source": [
@@ -531,7 +599,7 @@
531599
},
532600
{
533601
"cell_type": "markdown",
534-
"id": "20",
602+
"id": "22",
535603
"metadata": {},
536604
"source": [
537605
"# extracting questions from the problem sheet"
@@ -540,7 +608,7 @@
540608
{
541609
"cell_type": "code",
542610
"execution_count": null,
543-
"id": "21",
611+
"id": "23",
544612
"metadata": {},
545613
"outputs": [],
546614
"source": [
@@ -593,7 +661,7 @@
593661
{
594662
"cell_type": "code",
595663
"execution_count": null,
596-
"id": "22",
664+
"id": "24",
597665
"metadata": {},
598666
"outputs": [],
599667
"source": [
@@ -702,7 +770,7 @@
702770
{
703771
"cell_type": "code",
704772
"execution_count": null,
705-
"id": "23",
773+
"id": "25",
706774
"metadata": {},
707775
"outputs": [],
708776
"source": [
@@ -739,7 +807,7 @@
739807
},
740808
{
741809
"cell_type": "markdown",
742-
"id": "24",
810+
"id": "26",
743811
"metadata": {},
744812
"source": [
745813
"# Extract question parts and solutions"
@@ -748,7 +816,7 @@
748816
{
749817
"cell_type": "code",
750818
"execution_count": null,
751-
"id": "25",
819+
"id": "27",
752820
"metadata": {},
753821
"outputs": [],
754822
"source": [
@@ -807,7 +875,7 @@
807875
{
808876
"cell_type": "code",
809877
"execution_count": null,
810-
"id": "26",
878+
"id": "28",
811879
"metadata": {},
812880
"outputs": [],
813881
"source": [
@@ -837,7 +905,7 @@
837905
{
838906
"cell_type": "code",
839907
"execution_count": null,
840-
"id": "27",
908+
"id": "29",
841909
"metadata": {},
842910
"outputs": [],
843911
"source": [
@@ -1034,7 +1102,7 @@
10341102
},
10351103
{
10361104
"cell_type": "markdown",
1037-
"id": "28",
1105+
"id": "30",
10381106
"metadata": {},
10391107
"source": [
10401108
"# remove the duplicated text for single part questions"
@@ -1043,7 +1111,7 @@
10431111
{
10441112
"cell_type": "code",
10451113
"execution_count": null,
1046-
"id": "29",
1114+
"id": "31",
10471115
"metadata": {},
10481116
"outputs": [],
10491117
"source": [
@@ -1082,7 +1150,7 @@
10821150
{
10831151
"cell_type": "code",
10841152
"execution_count": null,
1085-
"id": "30",
1153+
"id": "32",
10861154
"metadata": {},
10871155
"outputs": [],
10881156
"source": [
@@ -1115,7 +1183,7 @@
11151183
{
11161184
"cell_type": "code",
11171185
"execution_count": null,
1118-
"id": "31",
1186+
"id": "33",
11191187
"metadata": {},
11201188
"outputs": [],
11211189
"source": [
@@ -1124,7 +1192,7 @@
11241192
},
11251193
{
11261194
"cell_type": "markdown",
1127-
"id": "32",
1195+
"id": "34",
11281196
"metadata": {},
11291197
"source": [
11301198
"# Displaying questions"
@@ -1133,7 +1201,7 @@
11331201
{
11341202
"cell_type": "code",
11351203
"execution_count": null,
1136-
"id": "33",
1204+
"id": "35",
11371205
"metadata": {},
11381206
"outputs": [],
11391207
"source": [
@@ -1160,7 +1228,7 @@
11601228
},
11611229
{
11621230
"cell_type": "markdown",
1163-
"id": "34",
1231+
"id": "36",
11641232
"metadata": {},
11651233
"source": [
11661234
"# in2lambda to JSON"
@@ -1169,7 +1237,7 @@
11691237
{
11701238
"cell_type": "code",
11711239
"execution_count": null,
1172-
"id": "35",
1240+
"id": "37",
11731241
"metadata": {},
11741242
"outputs": [],
11751243
"source": [

0 commit comments

Comments
 (0)