-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathhtml2txt.py
More file actions
81 lines (67 loc) · 2.55 KB
/
html2txt.py
File metadata and controls
81 lines (67 loc) · 2.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# -*- coding: utf-8 -*-
import re
from HTMLParser import HTMLParser
from htmlentitydefs import entitydefs
def mycompile(pat):
    """Compile *pat* with the ``re.UNICODE`` flag set."""
    return re.compile(pat, flags=re.UNICODE)


# One or more consecutive space characters (spaces only).
WS_RE = mycompile(r' +')

# URL pattern, kept as a plain string; it is not compiled here.
# NOTE(review): the character class near the end closes at its first "]" and
# none of the parentheses are backslash-escaped, so this looks like a URL
# regex whose escapes were lost somewhere -- confirm before using it.
Url_new = r"""((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|(([^\s()<>]+|(([^\s()<>]+)))*))+(?:(([^\s()<>]+|(([^\s()<>]+)))*)|[^\s`!()[]{};:'".,<>?«»“”‘’]))"""

# An "@" followed by one or more ASCII letters, digits or underscores.
AtMention = r'@[a-zA-Z0-9_]+'
def squeeze_whitespace(s):
    """Collapse every run of spaces in *s* to a single space.

    Leading and trailing whitespace is stripped from the collapsed result.
    """
    return WS_RE.sub(" ", s).strip()
class MLStripper(HTMLParser):
    """HTML parser that collects a document's text and discards its markup.

    Text nodes are kept as they are, every start tag and end tag becomes a
    single space, an entity reference whose name is in ``entitydefs`` becomes
    a single space, and an entity reference with an unknown name is kept as
    its raw source text. No handler for numeric character references is
    defined here.

    Usage: call ``feed()`` with the markup, then ``get_data()``.
    """

    def __init__(self):
        # Run the base initializer (it calls reset() itself) instead of only
        # reset(), so the state the base class relies on is fully set up.
        HTMLParser.__init__(self)
        # Collected output fragments, joined by get_data().
        self.fed = []
        # Raw form of an entity reference: "&name" plus one terminator char.
        self.entityref = re.compile('&[a-zA-Z][-.a-zA-Z0-9]*[^a-zA-Z0-9]')

    def handle_data(self, d):
        """Keep text content unchanged."""
        self.fed.append(d)

    def handle_starttag(self, tag, attrs):
        """Replace a start tag with a space so adjacent words stay apart."""
        self.fed.append(' ')

    def handle_endtag(self, tag):
        """Replace an end tag with a space so adjacent words stay apart."""
        self.fed.append(' ')

    def handle_entityref(self, name):
        """Emit a space for a known entity, the raw text for an unknown one."""
        if entitydefs.get(name) is not None:
            self.fed.append(' ')
        else:
            self.fed.append(self._raw_entityref(name))

    def _raw_entityref(self, name):
        """Return the source text of the unknown entity reference *name*.

        The text is looked up at the parser's current position. When it
        cannot be found there (for example the reference ends its line, so
        there is no terminator character to match), ``'&' + name`` is
        returned instead of raising.
        """
        fallback = '&' + name
        try:
            # Split on "\n" only: the parser's lineno/offset bookkeeping
            # counts "\n" alone, whereas splitlines() also breaks on "\r".
            rest = self.rawdata.split('\n')[self.lineno - 1][self.offset:]
        except IndexError:
            return fallback
        m = self.entityref.match(rest)
        if m is None:
            return fallback
        entity = m.group()
        # A terminating semicolon is consumed by the parser, other
        # terminator characters are not, so only ";" is part of the entity.
        if entity[-1] != ';':
            entity = entity[:-1]
        # Guard against having matched some other reference at a stale
        # position.
        if entity not in (fallback, fallback + ';'):
            return fallback
        return entity

    def get_data(self):
        """Finish parsing and return all collected fragments as one string."""
        self.close()
        return ''.join(self.fed)
def strip_tags(html):
    """Return the text of *html* with its markup removed.

    Before parsing, ``<code>`` and ``</code>`` are turned into backticks,
    each ``<pre ...>...</pre>`` block is replaced by the placeholder
    ``#CODE`` and each run of ``<br>`` tags becomes one newline. What is left
    is fed through ``MLStripper``.

    The patterns are applied without ``re.DOTALL``, so a ``<pre>`` block that
    contains a literal newline is not replaced (presumably the input stores
    line breaks as ``<br>`` -- verify against the caller).

    NOTE(review): in the version under review two pattern literals were
    split across physical lines (an unterminated string, i.e. a syntax
    error). They are reconstructed below as ``<br><br>`` and ``<br>`` on the
    assumption that line-break tags had been rendered as real newlines;
    confirm against the original file.
    """
    s = MLStripper()
    html = re.sub(r'<code>', '`', html)
    html = re.sub(r'</code>', '`', html)
    # A code block preceded by two line breaks: remove the breaks with it.
    # The ".*?" after "<pre" lets the tag carry attributes such as a class.
    html = re.sub(r'<br><br><pre.*?>.*?</pre>', '#CODE', html)
    # Any remaining code block, e.g. a post that is a code snippet only.
    html = re.sub(r'<pre.*?>.*?</pre>', '#CODE', html)
    #html = re.sub(r'(`(?=\S)|(?<=\S)`)', '', html)
    html = re.sub(r'(<br>)+', '\n', html)
    s.feed(html)
    return s.get_data()
def my_encoder(my_string):
    """Yield the items of *my_string* one by one as text.

    Each byte-string item is decoded as UTF-8 on its own; an item that is
    not valid UTF-8 by itself (such as any single byte of a multi-byte
    sequence) is yielded as one space instead. Items that are already text
    are yielded unchanged rather than being decoded a second time, which
    would raise ``TypeError``.
    """
    for item in my_string:
        if not isinstance(item, bytes):
            # Already decoded: nothing to do.
            yield item
            continue
        try:
            yield item.decode('utf-8')
        except UnicodeDecodeError:
            yield ' '  # or another whitespace
def html2txt(content):
    """Convert the HTML in *content* to whitespace-normalized plain text.

    Steps: strip the markup with ``strip_tags``, pass the result through
    ``my_encoder``, remove spaces at the very start and directly after each
    newline, collapse consecutive newlines, squeeze runs of spaces, and
    replace every ``AtMention`` match with ``@USER``.

    This is best-effort: if any step raises, *content* is returned unchanged.
    The failure is logged at debug level so it is no longer invisible.
    """
    import logging

    try:
        pro = ''.join(my_encoder(strip_tags(content)))
        pro = re.sub(r'^ +', '', pro)
        pro = re.sub(r'\n +', '\n', pro)
        pro = re.sub(r'[\n]+', '\n', pro)
        pro = squeeze_whitespace(pro)
        #pro = re.sub(Url_new, '#URL', pro, flags=re.DOTALL)
        pro = re.sub(AtMention, '@USER', pro)
        return pro
    except Exception:
        # Deliberate fallback: hand the input back rather than fail the
        # caller, but leave a trace of what went wrong.
        logging.getLogger(__name__).debug(
            "html2txt failed; returning input unchanged", exc_info=True)
        return content