forked from auth0/universal-login
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconvert_to_markdown.py
More file actions
99 lines (73 loc) · 3 KB
/
convert_to_markdown.py
File metadata and controls
99 lines (73 loc) · 3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#!/usr/bin/env python3
"""
Convert HTML documentation files to Markdown format.
Removes specific elements and extracts the first h1 as frontmatter title.
"""
import os
from pathlib import Path
from bs4 import BeautifulSoup
from markdownify import markdownify as md
# CSS selectors for elements that are stripped from every page before
# conversion (currently only the breadcrumb navigation list).
_ELEMENTS_TO_REMOVE = ('ul.tsd-breadcrumb[aria-label="Breadcrumb"]',)


def _build_frontmatter(title):
    """Return a YAML frontmatter block whose only key is ``title``.

    The title is written as a double-quoted scalar. A JSON string literal is
    also a valid YAML double-quoted scalar, so ``json.dumps`` takes care of
    escaping quotes, backslashes and newlines; without quoting, a heading
    containing ``: `` or ``#`` would produce broken frontmatter.
    """
    import json  # local import keeps this block self-contained

    quoted_title = json.dumps(title, ensure_ascii=False)
    return f"---\ntitle: {quoted_title}\n---\n"


def _html_to_mdx(html_content):
    """Convert one HTML document (as text) to MDX text with frontmatter.

    Only the ``div.col-content`` subtree is converted when it is present;
    otherwise the whole document is used. The selectors listed in
    ``_ELEMENTS_TO_REMOVE`` are dropped, and the first ``<h1>`` is taken out
    of the body and used as the frontmatter title ("Untitled" when there is
    no ``<h1>`` or it has no text).
    """
    soup = BeautifulSoup(html_content, "html.parser")

    # Restrict the conversion to the main content column when it exists.
    col_content = soup.find("div", {"class": "col-content"})
    working_content = col_content if col_content is not None else soup

    for selector in _ELEMENTS_TO_REMOVE:
        for element in working_content.select(selector):
            element.decompose()

    title = "Untitled"
    first_h1 = working_content.find("h1")
    if first_h1 is not None:
        # An empty heading would otherwise yield a blank title.
        title = first_h1.get_text(strip=True) or title
        # The heading lives in the frontmatter, so drop it from the body.
        first_h1.decompose()

    markdown_content = md(str(working_content), heading_style="ATX")
    return _build_frontmatter(title) + markdown_content


def main():
    """Convert every HTML file under the script's directory to MDX.

    HTML files are discovered recursively next to this script and written to
    a ``markdown_output`` sub-directory, mirroring the relative directory
    layout and swapping the suffix for ``.mdx``. A failure on one file is
    reported and does not stop the remaining files from being converted.
    """
    script_dir = Path(__file__).parent
    docs_dir = script_dir

    output_dir = script_dir / "markdown_output"
    output_dir.mkdir(exist_ok=True)

    html_files = list(docs_dir.glob("**/*.html"))
    print(f"Found {len(html_files)} HTML files to convert")

    converted_count = 0
    for html_file in html_files:
        # Top-level boundary: report any per-file failure and keep going.
        try:
            # Mirror the source layout inside the output directory.
            rel_path = html_file.relative_to(docs_dir)
            output_path = output_dir / rel_path.with_suffix(".mdx")

            html_content = html_file.read_text(encoding="utf-8")
            final_content = _html_to_mdx(html_content)

            output_path.parent.mkdir(parents=True, exist_ok=True)
            output_path.write_text(final_content, encoding="utf-8")

            converted_count += 1
            print(f"Converted: {rel_path} -> {output_path.relative_to(output_dir)}")
        except Exception as e:
            print(f"Error converting {html_file}: {e}")

    print(f"\nConversion complete! Converted {converted_count}/{len(html_files)} files")
    print(f"Output directory: {output_dir}")
if __name__ == "__main__":
    # Run the conversion only when executed as a script, not on import.
    main()