Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
134 changes: 130 additions & 4 deletions pphtml.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,16 @@
# pylint: disable=C0103, R0912, R0915
# pylint: disable=too-many-instance-attributes, too-many-locals, no-self-use

import sys
import os
import argparse
from time import strftime
import itertools
import os
import sys
from html.parser import HTMLParser
import regex as re # for unicode support (pip install regex)
from time import strftime

from PIL import Image # from pip install pillow
import regex as re # for unicode support (pip install regex)
import roman # for pphtml


class MyHTMLParser(HTMLParser):
Expand Down Expand Up @@ -73,6 +76,8 @@ class initialization
self.udefcss = {} # user defined CSS
self.usedcss = {} # CSS used by user
self.errormessage = "" # for unwrap failure
self.ranges_arabic = []
self.ranges_roman = []

def crash(self):
self.saveReport()
Expand Down Expand Up @@ -462,6 +467,126 @@ def cleanExt(self):
r.append("[pass] external links check")
self.apl(r)

def documentInfo(self):
    """
    Write the "document info" section of the report: a blank separator
    line, the dashed section banner, then the page range summary
    produced by findPageRanges().
    """
    title = "document info"
    # dashes that follow the title; the count is 73 less the title length
    trailing = "-" * (73 - len(title))

    self.ap("")
    self.ap(f"----- {title} " + trailing)
    self.findPageRanges()


def findPageRanges(self):
    """
    Find ranges of page locations for later reporting

    Every line of self.wb is searched for a page marker, which is either
      - an <a>, <div> or <span> whose id is "page"/"pg", an optional
        underscore, then the page number (e.g. id="Page_12"), or
      - a <span class="pagenum"> whose text holds the page number
        (e.g. "[Pg 12]").
    At most one marker is taken from each line; the id form wins.

    Page numbers made only of decimal digits are collected as arabic
    pages, valid roman numerals (either case) as roman pages, and
    anything else is ignored. Runs of consecutive pages are collapsed
    into [first, last, note] lists stored in self.ranges_arabic and
    self.ranges_roman, where note is None or a message saying the range
    is duplicated or out of sequence relative to the range before it.

    Two "[info]" lines listing the roman and arabic ranges are sent to
    the report through self.apl().
    """
    pages_arabic = []
    pages_roman = []

    # Look for <a>, <div>, <span> with 'id' attribute; \3 is the match
    pat1 = re.compile(r"""<(a|div|span)\s+[^>]*\bid=["'](page|pg)_?([\divxlcdm]+)["']""",
                      re.IGNORECASE)
    # Alternately: look for span class=pagenum; \1 is the match
    pat2 = re.compile(r"""<span\s+[^>]*\bclass=["']pagenum['"].*>([^<]+)</span""",
                      re.IGNORECASE)

    # decoration stripped from pagenum text, in this order
    # (the longer spellings must go before their prefixes)
    decorations = ("Page_", "page_", "page", "[", "]", "p.", "P.", "Pg.", "Pg")

    def roman_value(numeral):
        """Integer value of a roman numeral given in either case."""
        return roman.fromRoman(numeral.upper())

    def record(token):
        """File token under arabic or roman pages; drop it if it is neither."""
        # isdecimal() rather than isnumeric(): only accept what int() can parse
        if token.isdecimal():
            pages_arabic.append(int(token))
            return
        try:
            # validate only; the numeral is stored as written in the document
            roman_value(token)
        except roman.InvalidRomanNumeralError:
            # we tried...
            return
        pages_roman.append(token)

    def build_ranges(pages, value_of):
        """Collapse pages into [first, last, note] runs of consecutive values."""
        ranges = []
        last = -1
        # position minus value stays constant while the values rise by one,
        # so grouping on it turns [1,2,5,6,7] into [1,2] and [5,6,7]
        runs = itertools.groupby(enumerate(pages),
                                 key=lambda item: item[0] - value_of(item[1]))
        for _, run in runs:
            members = [page for _, page in run]
            high = value_of(members[-1])
            if high == last:
                note = "Page (or set of pages) is duplicated"
            elif high < last:
                note = "Page (or set of pages) is out of sequence"
            else:
                note = None
            ranges.append([members[0], members[-1], note])
            last = high
        return ranges

    def render(ranges):
        """Format ranges as 'a–b' (or just 'a' for one page), comma separated."""
        return ", ".join(
            str(low) if low == high else f"{low}–{high}"
            for low, high, _ in ranges
        )

    for line in self.wb:
        m = pat1.search(line)
        if m:
            # don't check the same line again if it already matched
            record(m.group(3))
            continue

        m = pat2.search(line)
        if m:
            text = m.group(1)
            for decoration in decorations:
                text = text.replace(decoration, "")
            record(text.strip())

    self.ranges_arabic = build_ranges(pages_arabic, int)
    self.ranges_roman = build_ranges(pages_roman, roman_value)

    self.apl([
        f"[info] page numbers ( roman): {render(self.ranges_roman)}",
        f"[info] page numbers (arabic): {render(self.ranges_arabic)}",
    ])

def linkToCover(self):
"""
either: provide a link in the document head, or
Expand Down Expand Up @@ -1255,6 +1380,7 @@ def run(self):
"""

self.loadFile()
self.documentInfo()
self.imageTests()
self.linkTests()
self.ppvTests()
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
Pillow>=7.1.0
regex>=2019.4.12
roman>=5.2