-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper.py
More file actions
64 lines (54 loc) · 2.17 KB
/
scraper.py
File metadata and controls
64 lines (54 loc) · 2.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import re
import urllib.request
import urllib.parse
from config import API, HEADERS
def fetch(url):
    """Download *url* with the configured HEADERS and return the body as text.

    The response is decoded as UTF-8; undecodable bytes are replaced rather
    than raising, since scraped pages are not always cleanly encoded.
    """
    request = urllib.request.Request(url, headers=HEADERS)
    with urllib.request.urlopen(request, timeout=15) as response:
        body = response.read()
    return body.decode("utf-8", errors="replace")
def search(query):
    """Query the site's search endpoint and return matching manga.

    Returns a list of ``{"id": ..., "title": ...}`` dicts; entries whose
    HTML snippet has no extractable id are dropped.
    """
    page = fetch(f"{API}/search?q={urllib.parse.quote(query)}")
    # One match per result card; group(1) is the card's inner HTML.
    item_pattern = re.compile(r'<div\s+class="book-item">([\s\S]*?)<\/div>\s*<\/div>', re.IGNORECASE)
    found = []
    for item in item_pattern.finditer(page):
        snippet = item.group(1)
        id_match = re.search(r'href="\/([^"]+)"', snippet, re.IGNORECASE)
        title_match = re.search(r'<h3>\s*<a[^>]+title="([^"]+)"', snippet, re.IGNORECASE)
        slug = id_match.group(1) if id_match else None
        if not slug:
            continue
        # Fall back to the slug (then a placeholder) when the card has no title attribute.
        name = title_match.group(1).strip() if title_match else (slug or "Untitled")
        found.append({"id": slug, "title": name})
    return found
def find_chapters(manga_id):
    """Return a manga's chapters sorted by chapter number.

    Fetches the detail page for *manga_id*, extracts the embedded numeric
    ``bookId``, then scrapes the chapter-list API for that book.

    Returns a list of ``{"id": ..., "title": ..., "num": float}`` dicts,
    ascending by ``num``; ``[]`` when no bookId is found on the page.
    """
    html = fetch(f"{API}/{manga_id}")
    book_id = re.search(r'var\s+bookId\s*=\s*(\d+);', html, re.IGNORECASE)
    if not book_id:
        # Detail page did not embed a numeric book id; nothing to list.
        return []
    book_id = book_id.group(1)
    chapters_html = fetch(f"{API}/api/manga/{book_id}/chapters?source=detail")
    pattern = re.compile(
        r'<li[^>]*>[\s\S]*?<a[^>]+href="([^"]+)"[^>]*>[\s\S]*?<strong[^>]*class="chapter-title"[^>]*>([^<]+)<\/strong>',
        re.IGNORECASE
    )
    chapters = []
    for m in pattern.finditer(chapters_html):
        href = m.group(1).strip()
        title = m.group(2).strip()
        # BUGFIX: the old capture r'Chapter\s+([\d.]+)' could grab strings
        # like "1.2.3" (or bare dots), which float() rejects with ValueError,
        # crashing the whole listing on one oddly-titled chapter. This
        # pattern only matches a well-formed decimal number.
        num = re.search(r'Chapter\s+(\d+(?:\.\d+)?)', title, re.IGNORECASE)
        chapters.append({
            "id": href.lstrip("/"),
            "title": title,
            # Unnumbered chapters sort first, matching the old "0" default.
            "num": float(num.group(1)) if num else 0.0,
        })
    return sorted(chapters, key=lambda c: c["num"])
def find_pages(chapter_id):
    """Return the chapter's page-image URLs, in reading order.

    The reader page embeds the image list as a single-quoted, comma-separated
    JS string (``var chapImages = '...'``). Relative entries are resolved
    against API; returns ``[]`` when the variable is absent.
    """
    html = fetch(f"{API}/{chapter_id}")
    match = re.search(r"var\s+chapImages\s*=\s*'([^']+)'", html, re.IGNORECASE)
    if not match:
        return []
    entries = (piece.strip() for piece in match.group(1).split(","))
    return [url if url.startswith("http") else f"{API}{url}" for url in entries if url]