-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhtml_to_csv_converter.py
More file actions
314 lines (254 loc) · 11.6 KB
/
html_to_csv_converter.py
File metadata and controls
314 lines (254 loc) · 11.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
#!/usr/bin/env python3
"""
HTML Book Notes to CSV Converter
Converts ereader book notes from HTML format to CSV for Readwise import
"""
import os
import re
import csv
import argparse
from datetime import datetime
from pathlib import Path
from bs4 import BeautifulSoup
from typing import List, Dict, Optional
class BookNotesConverter:
    """Convert e-reader HTML note exports into rows for a Readwise CSV import.

    Each exported HTML file is expected to hold one book: an ``<h2>`` with
    ``"Title - Author"`` and a series of styled ``<div>`` containers, one per
    highlight. The converter recognises those containers purely by fragments
    of their inline ``style`` attribute.
    """

    # Timestamp layouts accepted by parse_date(), tried in order.
    _DATE_INPUT_FORMATS = ("%Y-%m-%d %H:%M", "%Y-%m-%d %H:%M:%S", "%Y-%m-%d")
    # Layout written to the 'Date' column.
    _DATE_OUTPUT_FORMAT = "%Y-%m-%d %H:%M:%S"
    # Remark text treated as "no user note" (the export's default placeholder).
    _DEFAULT_NOTE = "Underline notes"

    def __init__(self):
        # Column order of every CSV this converter writes.
        self.csv_headers = [
            'Highlight',
            'Title',
            'Author',
            'URL',
            'Note',
            'Location',
            'Location Type',
            'Date',
        ]

    @staticmethod
    def _style_matcher(*required, excluding=()):
        """Build a predicate for BeautifulSoup's ``style=`` filter.

        The predicate accepts a tag's ``style`` value (``None`` when the
        attribute is absent) and is true only when every ``required``
        fragment occurs in it and no ``excluding`` fragment does.
        """
        def matches(style):
            return (
                bool(style)
                and all(fragment in style for fragment in required)
                and not any(fragment in style for fragment in excluding)
            )
        return matches

    def _write_csv(self, csv_path, rows) -> None:
        """Write ``rows`` (dicts keyed by ``csv_headers``) to ``csv_path`` as UTF-8 CSV."""
        # newline='' lets the csv module control line endings itself.
        with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=self.csv_headers)
            writer.writeheader()
            writer.writerows(rows)

    def extract_title_author(self, soup: "BeautifulSoup") -> tuple[str, str]:
        """Extract book title and author from the first ``<h2>`` tag.

        The heading is split on its LAST ``' - '`` so that titles which
        themselves contain a dash stay intact.

        Returns:
            ``(title, author)``; ``("", "")`` when there is no ``<h2>``, and
            ``(heading, "")`` when the heading has no ``' - '`` separator.
        """
        h2_tag = soup.find('h2')
        if h2_tag is None:
            return "", ""

        heading = h2_tag.get_text().strip()
        title, separator, author = heading.rpartition(' - ')
        if not separator:
            return heading, ""
        return title.strip(), author.strip()

    def parse_date(self, date_str: str) -> str:
        """Normalise a timestamp to ``YYYY-MM-DD HH:MM:SS``.

        Accepts ``YYYY-MM-DD HH:MM`` (the expected export format, e.g.
        ``"2025-10-12 19:38"``), ``YYYY-MM-DD HH:MM:SS`` and a bare
        ``YYYY-MM-DD`` (midnight is assumed).

        Returns:
            The normalised timestamp, or the stripped input unchanged when it
            matches none of the accepted layouts.
        """
        cleaned = date_str.strip()
        for layout in self._DATE_INPUT_FORMATS:
            try:
                parsed = datetime.strptime(cleaned, layout)
            except ValueError:
                continue
            return parsed.strftime(self._DATE_OUTPUT_FORMAT)
        # Unparseable: hand the text back untouched rather than a mangled guess.
        return cleaned

    def _extract_note(self, container) -> str:
        """Return the user's remark from a highlight container, or ``""``.

        The remark is read from the second ``<td>`` of the container's first
        ``<table>``; the export's default placeholder text counts as no note.
        """
        table = container.find('table')
        if table is None:
            return ""
        cells = table.find_all('td')
        if len(cells) < 2:
            return ""
        remark = cells[1].get_text().strip()
        return "" if remark == self._DEFAULT_NOTE else remark

    def extract_highlights(self, soup: "BeautifulSoup", title: str, author: str) -> List[Dict[str, str]]:
        """Extract all highlights from the parsed HTML.

        Args:
            soup: Parsed document of one exported book.
            title: Book title copied into every row.
            author: Book author copied into every row.

        Returns:
            One dict per highlight, keyed by ``csv_headers``, in reverse
            document order, with ``Location`` numbered from 1 in that order.
        """
        is_container = self._style_matcher('padding-top: 1em; padding-bottom: 1em')
        is_chapter_heading = self._style_matcher('color: #48b4c1', 'font-weight: bold')
        is_date_bar = self._style_matcher('border-left: 5px solid rgb(237,108,0)')
        # The date bar is also 12pt, so the body is the 12pt div WITHOUT a border.
        is_highlight_body = self._style_matcher('font-size: 12pt', excluding=('border-left',))

        collected = []  # (text, note, date) tuples in document order
        for container in soup.find_all('div', style=is_container):
            # Containers that hold a chapter heading carry no highlight.
            if container.find('span', style=is_chapter_heading) is not None:
                continue

            body = container.find('div', style=is_highlight_body)
            if body is None:
                continue
            text = body.get_text().strip()
            if not text:
                continue

            date_div = container.find('div', style=is_date_bar)
            date = ""
            if date_div is not None:
                date = self.parse_date(date_div.get_text().strip())

            collected.append((text, self._extract_note(container), date))

        # Per the original author's note the export lists later passages first,
        # so rows are emitted in reverse and numbered in that final order.
        return [
            {
                'Highlight': text,
                'Title': title,
                'Author': author,
                'URL': '',  # No URL for books
                'Note': note,
                'Location': str(position),
                'Location Type': 'order',
                'Date': date,
            }
            for position, (text, note, date) in enumerate(reversed(collected), start=1)
        ]

    def convert_html_file(self, html_file_path: str) -> List[Dict[str, str]]:
        """Convert a single HTML file to highlight rows.

        Best-effort by design: any failure while reading or parsing the file
        is reported on stdout and yields an empty list, so one bad file does
        not abort a multi-file run.
        """
        try:
            with open(html_file_path, 'r', encoding='utf-8') as file:
                content = file.read()
            soup = BeautifulSoup(content, 'html.parser')
            title, author = self.extract_title_author(soup)
            return self.extract_highlights(soup, title, author)
        except Exception as e:
            print(f"Error processing {html_file_path}: {str(e)}")
            return []

    def convert_to_csv(self, html_files: List[str], output_csv: str):
        """Convert multiple HTML files to a single CSV file.

        Args:
            html_files: Paths of the HTML exports, processed in the given order.
            output_csv: Destination CSV path (overwritten if it exists).

        Returns:
            The list of all highlight rows that were written.
        """
        all_highlights = []
        for html_file in html_files:
            print(f"Processing: {html_file}")
            highlights = self.convert_html_file(html_file)
            all_highlights.extend(highlights)
            print(f" Found {len(highlights)} highlights")

        self._write_csv(output_csv, all_highlights)

        print(f"\nConversion complete!")
        print(f"Total highlights: {len(all_highlights)}")
        print(f"Output file: {output_csv}")
        return all_highlights

    def batch_process_folder(self, html_folder: str = "html-files", output_folder: str = "output"):
        """
        Batch process all ``*.html`` files directly inside ``html_folder``.

        Creates one CSV per book in ``output_folder`` plus a combined
        ``all_books_combined.csv``. The output folder (including missing
        parents) is created if needed. Files are processed in sorted name
        order so the combined CSV is reproducible.
        """
        html_folder_path = Path(html_folder)
        output_folder_path = Path(output_folder)
        output_folder_path.mkdir(parents=True, exist_ok=True)

        html_files = sorted(html_folder_path.glob('*.html'))
        if not html_files:
            print(f"No HTML files found in '{html_folder}' folder.")
            print(f"Please place your HTML book note files in the '{html_folder}' folder and run again.")
            return

        print(f"Found {len(html_files)} HTML files to process:")
        for file in html_files:
            print(f" - {file.name}")
        print()

        all_combined_highlights = []
        individual_files_created = []
        for html_file in html_files:
            # Drop a trailing export timestamp such as _20251026_083943 ...
            book_name = re.sub(r'_\d{8}_\d{6}$', '', html_file.stem)
            # ... and neutralise characters that are illegal in file names.
            safe_filename = re.sub(r'[<>:"/\\|?*]', '_', book_name)
            # NOTE: two exports of the same book map to the same CSV name, so
            # the later one overwrites the earlier individual file.
            csv_path = output_folder_path / f"{safe_filename}.csv"

            print(f"Processing: {html_file.name}")
            highlights = self.convert_html_file(str(html_file))
            if highlights:
                self._write_csv(csv_path, highlights)
                all_combined_highlights.extend(highlights)
                individual_files_created.append(csv_path)
                print(f" ✓ Created: {csv_path}")
                print(f" ✓ Found {len(highlights)} highlights")
            else:
                print(f" ✗ No highlights found in {html_file.name}")
            print()

        if not all_combined_highlights:
            print("No highlights were found in any of the HTML files.")
            return

        combined_csv_path = output_folder_path / "all_books_combined.csv"
        self._write_csv(combined_csv_path, all_combined_highlights)

        print("="*60)
        print("BATCH PROCESSING COMPLETE!")
        print("="*60)
        print(f"Individual CSV files created: {len(individual_files_created)}")
        for file in individual_files_created:
            print(f" - {file}")
        print(f"\nCombined CSV file: {combined_csv_path}")
        print(f"Total highlights across all books: {len(all_combined_highlights)}")
        print(f"\nAll files saved in: {output_folder_path.absolute()}")
def main(argv: Optional[List[str]] = None) -> int:
    """Command-line entry point.

    Args:
        argv: Argument list to parse; ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    Returns:
        Process exit status: ``0`` after a conversion ran, ``1`` when the
        arguments were unusable (missing/invalid input path, no HTML files).
    """
    parser = argparse.ArgumentParser(
        description='Convert HTML book notes to CSV format for Readwise import',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            "\n"
            "Examples:\n"
            "# Easy batch processing (recommended):\n"
            "python html_to_csv_converter.py --batch\n"
            "# Process a single file:\n"
            "python html_to_csv_converter.py book.html\n"
            "# Process all files in a directory:\n"
            "python html_to_csv_converter.py /path/to/html/files/\n"
        ),
    )
    parser.add_argument('input_path', nargs='?', help='Path to HTML file or directory containing HTML files')
    parser.add_argument('-o', '--output', help='Output CSV file path (default: highlights.csv)', default='highlights.csv')
    parser.add_argument('--batch', action='store_true',
                        help='Batch process all HTML files in the "html-files" folder (recommended)')
    args = parser.parse_args(argv)

    # Batch processing mode (recommended); uses the converter's default
    # folders, so input_path and --output are not consulted here.
    if args.batch:
        BookNotesConverter().batch_process_folder()
        return 0

    # Manual processing mode
    if not args.input_path:
        print("Error: input_path is required when not using --batch mode")
        print("Use --batch for easy processing, or provide an input path")
        parser.print_help()
        return 1

    input_path = Path(args.input_path)
    if input_path.is_file():
        if input_path.suffix.lower() != '.html':
            print("Error: Input file must be an HTML file")
            return 1
        html_files = [str(input_path)]
    elif input_path.is_dir():
        # Sorted so the rows of the combined CSV come out in a stable order.
        html_files = sorted(str(f) for f in input_path.glob('*.html'))
        if not html_files:
            print("Error: No HTML files found in the specified directory")
            return 1
    else:
        print("Error: Input path does not exist")
        return 1

    BookNotesConverter().convert_to_csv(html_files, args.output)
    return 0


if __name__ == "__main__":
    # Propagate main()'s status so shells and scripts can detect failures.
    raise SystemExit(main())