-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfunc.py
More file actions
97 lines (89 loc) · 3.16 KB
/
func.py
File metadata and controls
97 lines (89 loc) · 3.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
from io import StringIO
from utils import get_img_format
from typing import Callable, Awaitable
from bs4.element import NavigableString, Tag
from types import MappingProxyType as FrozenDict
from NoSuchTagException import NoSuchTagException
async def extract_paragraph(child: Tag) -> str:
    """Render the children of a ``<p>`` tag into one string.

    String children are written using their ``get_text()`` value; every
    other child is handed to ``get_content``, which appends its rendering
    to the same buffer.

    Args:
        child: The tag to render; its name must be ``"p"``.

    Returns:
        The concatenated text of all children, in document order.
    """
    assert child.name == "p"
    with StringIO() as buffer:
        for node in child:
            if not isinstance(node, (str, NavigableString)):
                await get_content(node, buffer)
                continue
            buffer.write(node.get_text())
        # Read the value before the context manager closes the buffer.
        return buffer.getvalue()
async def extract_list(child: Tag) -> str:
    """Render a ``<ul>`` or ``<ol>`` tag as text, one prefixed entry per child.

    Children equal to ``"\\n"`` are skipped. Every other child is prefixed
    with ``"∙ "`` for ``ul`` or with a 1-based counter followed by ``". "``
    for ``ol``, then rendered through ``get_content``.

    Args:
        child: The list tag; its name must be ``"ol"`` or ``"ul"``.

    Returns:
        The rendered list as a single string.
    """
    assert child.name in ("ol", "ul")
    position = 0
    with StringIO() as buffer:
        for entry in child:
            if entry == "\n":
                continue
            if child.name == "ol":
                position += 1
            marker = "∙ " if child.name == "ul" else f"{position}. "
            buffer.write(marker)
            await get_content(entry, buffer)
            # NOTE(review): assumes this newline is written once per entry
            # (inside the loop) — confirm against the original indentation.
            buffer.write("\n")
        # Read the value before the context manager closes the buffer.
        return buffer.getvalue()
async def extract_img(child: Tag) -> str:
    """Return the result of ``get_img_format`` for an ``<img>`` tag's ``src``.

    Args:
        child: The image tag; its name must be ``"img"`` and its ``src``
            attribute must be non-empty.

    Returns:
        Whatever ``get_img_format`` produces for the ``src`` value.

    Raises:
        KeyError: If the tag has no ``src`` attribute.
    """
    assert child.name == "img"
    source = child['src']
    assert source != ""
    return get_img_format(source)
async def extract_code(child: Tag) -> str:
    """Render a ``<pre>`` tag as text wrapped in ``<code>`` / ``</code>`` lines.

    Children equal to ``"\\n"`` are skipped. ``NavigableString`` children are
    written verbatim via their ``.string``; any other child is rendered
    through ``get_content``.

    Args:
        child: The tag to render; its name must be ``"pre"``.

    Returns:
        The text starting with ``"<code>\\n"`` and ending with ``"</code>\\n"``.
    """
    assert child.name == "pre"
    with StringIO() as buffer:
        buffer.write("<code>\n")
        for node in child:
            if node == "\n":
                continue
            if not isinstance(node, NavigableString):
                await get_content(node, buffer)
            else:
                buffer.write(node.string)
        buffer.write("</code>\n")
        # Read the value before the context manager closes the buffer.
        return buffer.getvalue()
async def extract_list_item(child: Tag) -> str:
    """Render the children of an ``<li>`` tag into one string.

    Children equal to ``"\\n"`` are skipped; all others are rendered through
    ``get_content`` into a shared buffer.

    Args:
        child: The tag to render; its name must be ``"li"``.

    Returns:
        The concatenated rendering of the item's children.
    """
    assert child.name == "li"
    with StringIO() as buffer:
        for node in child:
            if node != "\n":
                await get_content(node, buffer)
        # Read the value before the context manager closes the buffer.
        return buffer.getvalue()
def mathjax_equivalent(html_tag: str, mathjax_tag: str, resolve_tag: bool = False) -> Callable[[Tag], Awaitable[str]]:
    """Build a coroutine that renders ``html_tag`` elements from a fixed template.

    Args:
        html_tag: Tag name the returned converter accepts (checked with
            ``assert``).
        mathjax_tag: Replacement text. Unless ``resolve_tag`` is true it is
            used as a ``str.format`` template whose ``{0}`` field receives
            the tag's text.
        resolve_tag: When true, ``mathjax_tag`` is returned verbatim and the
            tag's content is ignored.

    Returns:
        An async function that takes a tag and returns the rendered string.
    """

    async def convert(child: Tag) -> str:
        assert child.name == html_tag
        if resolve_tag:
            return mathjax_tag
        # `.string` is None when the tag has no children or more than one
        # child; formatting None would emit the literal text "None", so fall
        # back to the tag's full text in that case.
        text = child.string
        if text is None:
            text = child.get_text()
        return mathjax_tag.format(text)

    return convert
# Dispatch table mapping an HTML tag name to the coroutine that renders it.
# It is wrapped in a read-only mapping proxy, so entries cannot be added or
# replaced through this name. `get_content` looks tags up here and raises
# NoSuchTagException for any tag name that is not listed.
jump_table = FrozenDict({
    "p": extract_paragraph,
    "ul": extract_list,
    "ol": extract_list,
    "img": extract_img,
    "pre": extract_code,
    "li": extract_list_item,
    # Inline tags rendered from a fixed template instead of a dedicated
    # extractor: "br" always yields a newline, while "em" and "strong" wrap
    # the tag's text in \text{...} and \boldsymbol{...} respectively.
    "br": mathjax_equivalent("br", "\n", True),
    "em": mathjax_equivalent("em", "\\text{{{0}}}"),
    "strong": mathjax_equivalent("strong", "\\boldsymbol{{{0}}}"),
})
async def get_content(child: Tag, contentIO: StringIO) -> None:
    """Render one node and append the result to ``contentIO``.

    A node equal to ``"\\n"`` is ignored. A ``NavigableString`` is written
    as-is (it has no ``name`` to dispatch on). Any other node is dispatched
    by tag name through ``jump_table``; a non-empty result is written
    followed by a newline, while an empty result writes nothing.

    Args:
        child: The node to render.
        contentIO: Buffer that receives the rendered text.

    Raises:
        NoSuchTagException: If the node's tag name has no entry in
            ``jump_table``.
    """
    if child == "\n":
        return

    if isinstance(child, NavigableString):
        contentIO.write(child.string)
        return

    handler = jump_table.get(child.name)
    if handler is None:
        raise NoSuchTagException(child.name)

    rendered = await handler(child)
    if len(rendered) == 0:
        return
    contentIO.write(rendered)
    contentIO.write("\n")