From 2cec96d2de2ffaa14dd37dcab7f663d6978ad380 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Sat, 21 Feb 2026 09:25:06 +0000
Subject: [PATCH] Filter pale chapters to extract only chapter content

Add extract_chapter_content() that uses regex to pull out just the
entry-title and entry-content from each downloaded page, stripping
comments, sharing buttons, navigation, related posts, and other
non-chapter elements.

Co-Authored-By: tom mottes <tom.mottes@gmail.com>
---
 save_ebook.py | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)
diff --git a/save_ebook.py b/save_ebook.py
index b1992df..da3b451 100644
--- a/save_ebook.py
+++ b/save_ebook.py
@@ -2,6 +2,7 @@
 # cd d:\tom\scripts
 # py save_ebook.py
 
+import re
 import requests
 
 twig_base = "https://twigserial.wordpress.com"
@@ -679,11 +680,36 @@
 #             out.write(r.text)
 #             out.write("\n\n" + "="*50 + "\n\n")
 
+def extract_chapter_content(html):
+    """Return the chapter heading and body extracted from a page's HTML.
+
+    The result is the whole ``<h1 class="entry-title">`` element (tags
+    included), a newline, then the inner HTML of the entry-content div,
+    truncated at the first jp-post-flair or jp-relatedposts div. A part
+    that is not found contributes an empty string.
+    """
+    heading = re.search(r'<h1 class="entry-title">(.*?)</h1>', html, re.DOTALL)
+    body = re.search(
+        r'<div class="entry-content">(.*?)</div><!-- \.entry-content -->',
+        html,
+        re.DOTALL,
+    )
+    title = heading.group(0) if heading else ""
+    content = body.group(1).strip() if body else ""
+    # Each pattern swallows its footer div and everything that follows it.
+    for footer_pattern in (
+        r'<div id="jp-post-flair".*',
+        r'<div id="jp-relatedposts".*',
+    ):
+        content = re.sub(footer_pattern, "", content, flags=re.DOTALL)
+    return f"{title}\n{content.strip()}"
+
+
 with open("pale_full.html", "w", encoding="utf-8") as out:
     for chapter in pale_chapters.splitlines():
         url = chapter.strip()
         r = requests.get(url)
         real_url = r.url
         print(real_url)
-        out.write(r.text)
+        out.write(extract_chapter_content(r.text))  # title + chapter body only
         out.write("\n\n" + "="*50 + "\n\n")