|
| 1 | +from __future__ import absolute_import |
| 2 | + |
| 3 | +import re |
| 4 | + |
| 5 | +import bs4 |
| 6 | + |
| 7 | +from six import StringIO |
| 8 | +from six import PY3 |
| 9 | + |
| 10 | +from fulltext.util import BaseBackend |
| 11 | + |
| 12 | + |
class Backend(BaseBackend):
    """Extract plain text and a title from markup parsed with BeautifulSoup.

    Documents are parsed with the ``lxml`` parser.  Text is taken only from
    nodes whose direct parent is a ``<body>`` or ``<p>`` tag.

    NOTE(review): the ``empty-line`` and ``book-title`` tag names suggest the
    input is FictionBook (FB2) -- confirm against the backend registry.
    """

    def setup(self):
        """Initialise per-instance state.

        NOTE(review): presumably invoked by ``BaseBackend``; verify there.
        """
        self.bs = None

    def is_visible(self, elem):
        """Return True if the text node *elem* belongs in the output.

        A node is rejected when it is a comment, processing instruction or
        doctype, when its direct parent is neither ``<body>`` nor ``<p>``,
        or when its text starts with a literal ``<!--...-->`` sequence.
        """
        # bs4 hands comments back as ``Comment`` nodes whose string value is
        # only the inner text (no ``<!--``/``-->`` delimiters), so the regex
        # further down can never catch them; filter them by type instead.
        if isinstance(elem, (bs4.element.Comment,
                             bs4.element.ProcessingInstruction,
                             bs4.element.Doctype)):
            return False

        if elem.parent.name not in ("body", "p"):
            return False

        if not PY3:
            elem = elem.encode(self.encoding, self.encoding_errors)

        # Kept for backward compatibility: hides text that literally begins
        # with comment markup (e.g. escaped markup inside a paragraph).
        if re.match('<!--.*-->', elem):
            return False

        return True

    def handle_fobj(self, f):
        """Return the visible text of the document read from file object *f*.

        The raw content is decoded with ``self.decode``.  Each visible text
        node is written followed by a newline; a text node whose parent is an
        ``<empty-line>`` tag contributes an extra newline before it.
        """
        tdata = self.decode(f.read())
        soup = bs4.BeautifulSoup(tdata, 'lxml')
        text = StringIO()

        for elem in soup.find_all(text=True):
            if elem.parent.name == "empty-line":
                text.write(u"\n")
            if self.is_visible(elem):
                text.write(elem)
                text.write(u"\n")

        return text.getvalue()

    def handle_title(self, f):
        """Return the string of the first ``<book-title>`` tag, or None.

        *f* is either an object exposing a ``name`` attribute (the file is
        re-opened by that name) or the path itself.  None is returned when
        the document has no ``<book-title>`` tag or the tag has no single
        string child.
        """
        # Function-scope import keeps this block self-contained.  The
        # builtin open() on Python 2 rejects ``encoding``/``errors``;
        # io.open() accepts them on both Python 2 and Python 3.
        import io

        try:
            fname = f.name
        except AttributeError:
            fname = f

        with io.open(fname, "r", encoding=self.encoding,
                     errors=self.encoding_errors) as book:
            markup = book.read()

        title = bs4.BeautifulSoup(markup, 'lxml').find("book-title")
        if title is None:
            return None
        return title.string
0 commit comments