Skip to content

Commit d74c155

Browse files
committed
Close #97
Add fb2 extension backend
1 parent 9234cc1 commit d74c155

File tree

4 files changed

+72
-1
lines changed

4 files changed

+72
-1
lines changed

README.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@ Supported formats
5454
+-----------+-------------------------------------+----------------------------------------------+
5555
| ``epub`` | ``ebooklib`` module | ``ebooklib`` module |
5656
+-----------+-------------------------------------+----------------------------------------------+
57+
| ``fb2`` | ``BeautifulSoup`` module | ``BeautifulSoup`` module |
58+
+-----------+-------------------------------------+----------------------------------------------+
5759
| ``gif`` | tesseract CLI and pytesseract module | |
5860
+-----------+-------------------------------------+----------------------------------------------+
5961
| ``gz`` | python ``gzip`` module | python ``gzip`` module |
@@ -116,6 +118,8 @@ file extensions:
116118
+-----------+-------------------------------------+----------------------------------------------+
117119
| ``epub`` | ``exiftool`` CLI tool | |
118120
+-----------+-------------------------------------+----------------------------------------------+
121+
| ``fb2`` | ``BeautifulSoup`` module | ``BeautifulSoup`` module |
122+
+-----------+-------------------------------------+----------------------------------------------+
119123
| ``html`` | ``BeautifulSoup`` module | ``BeautifulSoup`` module |
120124
+-----------+-------------------------------------+----------------------------------------------+
121125
| ``odt`` | ``exiftool`` CLI tool | ``exiftool`` CLI tool |

fulltext/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@
113113
from fulltext.backends import __docx # NOQA
114114
from fulltext.backends import __eml # NOQA
115115
from fulltext.backends import __epub # NOQA
116+
from fulltext.backends import __fb2 # NOQA
116117
from fulltext.backends import __gz # NOQA
117118
from fulltext.backends import __html # NOQA
118119
from fulltext.backends import __hwp # NOQA
@@ -276,6 +277,11 @@ def register_backend(mimetype, module, extensions=None):
276277
'fulltext.backends.__epub',
277278
extensions=[".epub"])
278279

280+
register_backend(
281+
"application/fb2",
282+
'fulltext.backends.__fb2',
283+
extensions=[".fb2"])
284+
279285
register_backend(
280286
'application/postscript',
281287
'fulltext.backends.__ps',

fulltext/backends/__fb2.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
from __future__ import absolute_import
2+
3+
import re
4+
5+
import bs4
6+
7+
from six import StringIO
8+
from six import PY3
9+
10+
from fulltext.util import BaseBackend
11+
12+
13+
class Backend(BaseBackend):
    """Backend extracting text and title from FictionBook (``.fb2``) files.

    The document is parsed with BeautifulSoup using the ``lxml`` parser.
    Body text is collected from text nodes whose direct parent is a
    ``body`` or ``p`` element; the title is read from the first
    ``book-title`` element.
    """

    def setup(self):
        # Initialised to None here; no method of this class reassigns it.
        self.bs = None

    def is_visible(self, elem):
        """Return True if the text node *elem* belongs in the extracted text.

        Processing instructions, doctype declarations and comments are
        rejected, as is any node whose direct parent is not a ``body`` or
        ``p`` element.
        """
        # bs4 Comment nodes hold only the comment body (the "<!--" / "-->"
        # delimiters are not part of the string value), so they have to be
        # rejected by type; a regex on the text cannot recognise them.
        if isinstance(elem, (bs4.element.ProcessingInstruction,
                             bs4.element.Doctype,
                             bs4.element.Comment)):
            return False

        if elem.parent.name not in ("body", "p"):
            return False

        if not PY3:
            elem = elem.encode(self.encoding, self.encoding_errors)

        # Kept from the original implementation: drop text that itself
        # looks like a literal comment.
        if re.match('<!--.*-->', elem):
            return False

        return True

    def handle_fobj(self, f):
        """Read the file object *f* and return its visible text.

        Every visible text node is written followed by a newline; a text
        node whose parent is an ``empty-line`` element additionally
        triggers a blank line.
        """
        bdata = f.read()
        tdata = self.decode(bdata)
        text = StringIO()
        soup = bs4.BeautifulSoup(tdata, 'lxml')

        for elem in soup.find_all(text=True):
            if elem.parent.name == "empty-line":
                text.write(u"\n")
            if self.is_visible(elem):
                text.write(elem)
                text.write(u"\n")

        return text.getvalue()

    def handle_title(self, f):
        """Return the book title, or None if no ``book-title`` element exists.

        *f* may be an object exposing a ``name`` attribute (for example an
        open file) or a filesystem path; in both cases the file is opened
        by name and decoded with the backend's configured encoding.
        """
        # Local import: io.open accepts ``encoding``/``errors`` on Python 2
        # too, whereas the Python 2 builtin open() does not.
        import io

        fname = getattr(f, "name", f)

        with io.open(fname, "r", encoding=self.encoding,
                     errors=self.encoding_errors) as book:
            content = book.read()

        soup = bs4.BeautifulSoup(content, 'lxml')
        title = soup.find("book-title")
        if title is None:
            return None
        return title.string

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010

1111
NAME = 'fulltext'
12-
VERSION = '0.7'
12+
VERSION = '0.8'
1313
if os.name == 'nt' and not sys.maxsize > 2 ** 32:
1414
# https://github.com/btimby/fulltext/issues/79
1515
raise RuntimeError("Python 32 bit is not supported")

0 commit comments

Comments
 (0)