From 2cec96d2de2ffaa14dd37dcab7f663d6978ad380 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Sat, 21 Feb 2026 09:25:06 +0000 Subject: [PATCH] Filter pale chapters to extract only chapter content Add extract_chapter_content() that uses regex to pull out just the entry-title and entry-content from each downloaded page, stripping comments, sharing buttons, navigation, related posts, and other non-chapter elements. Co-Authored-By: tom mottes --- save_ebook.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/save_ebook.py b/save_ebook.py index b1992df..da3b451 100644 --- a/save_ebook.py +++ b/save_ebook.py @@ -2,6 +2,7 @@ # cd d:\tom\scripts # py save_ebook.py +import re import requests twig_base = "https://twigserial.wordpress.com" @@ -679,11 +680,36 @@ # out.write(r.text) # out.write("\n\n" + "="*50 + "\n\n") +def extract_chapter_content(html): + title_match = re.search(r'

(.*?)

', html, re.DOTALL) + title = title_match.group(0) if title_match else "" + content_match = re.search( + r'
(.*?)
', + html, + re.DOTALL, + ) + content = content_match.group(1).strip() if content_match else "" + if content: + content = re.sub( + r'