website.py
import importlib as imp
import json
import sys
import time
from collections import deque
from pprint import pprint

import requests

import config
from utils import logutils, urlutils, crawlutils, profileutil, listutils, queueconnection
from utils.bloomdClient import bloom
from utils.robotexclusionrulesparser import RobotExclusionRulesParser

# Filter modules are imported by name at runtime, so their directory has to be importable.
sys.path.append('filters')
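

# Website wraps the crawl state for a single domain: it fetches robots.txt, keeps
# a queue of internal URLs to visit, runs the configured filter modules on each
# parsed page, and persists results to ElasticSearch when save() is called.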
class Website:
    def __init__(self, startUrl, initialMeta=None):
self.startUrl = startUrl
self.parsedUrls = []
self.externalLinks = []
self.domain = urlutils.getDomain(startUrl)
self.parsedPages = 0
self.lastRequest = time.time()
self.requestMinInterval = 1 #not used
self.parseLimit = config.pagesParsedPerSite
self.fails = 0
self.inProgress = False
self.repr = RobotExclusionRulesParser()
        self.initialMeta = initialMeta or {}  # meta info passed through to the filters
self.updateMode = True
# self.client = BloomdClient(["107.170.243.148"])
# self.bloom = self.client.create_filter("domains")
try:
# TODO: get protocol for startURL
response = crawlutils.getSource('http://' + self.domain + '/robots.txt')
robotsTxt = response.text
# get last url in a sequence of redirects
if response.reason == 'OK':
self.startUrl = response.url
        except Exception:
            robotsTxt = ''
self.repr.parse(robotsTxt)
self.firstPageFiltersData = {}
# check starturl against robots.txt
if not self.repr.is_allowed(config.botName, startUrl):
            # utils.log('crawling root forbidden by robots.txt')
self.ended = True
else:
self.ended = False
self.filters = [
{'name': 'textratio'},
{'name': 'title'},
{'name': 'keywords'},
{'name': 'description'},
{'name': 'numberofimages'},
{'name': 'numberoflinks'}
]
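        # Each entry names a module in the filters/ directory that exposes a
        # filter(source, meta) function (see runFilters below).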
# Load filters
for filter in self.filters:
filter['mod'] = imp.import_module(filter['name'])
self.firstPageFilters = [
{'name': 'encoding'}
]
for filter in self.firstPageFilters:
filter['mod'] = imp.import_module(filter['name'])
        # TODO: entries here need to be unique
        # TODO: put a bloom filter here
self.toParse = deque([startUrl])
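
    # Fetch the next queued internal URL, collect its links, run the filters and
    # store the result. Returns True when a page was parsed, False otherwise
    # (empty queue, parse limit reached, or an unusable response).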
def crawlNextUrl(self):
if self.ended:
return False
if len(self.toParse) > 0:
currentUrl = self.toParse.popleft()
else:
self.save()
self.ended = True
if config.log:
logutils.log('end: empty internal links queue', self.toParse)
return False
if config.log:
print(currentUrl)
self.lastRequest = time.time()
        response = crawlutils.getSource(currentUrl)
if not self.isValidResponse(response):
# utils.log('invalid response', currentUrl)
return False
        source = response.text
# GET NEW LINKS
newLinks = crawlutils.getLinks(source, currentUrl)
# remove external links
internalNewLinks = [link for link in newLinks if urlutils.isInternalLink(link, currentUrl)]
# remove current url
internalNewLinks = [link for link in internalNewLinks if link != currentUrl]
# remove invalid extensions
internalNewLinks = [link for link in internalNewLinks if urlutils.validExtension(link)]
# remove already parsed URLs
internalNewLinks = [link for link in internalNewLinks if link not in [l['url'] for l in self.parsedUrls]]
# remove already enqueued URLs
internalNewLinks = [link for link in internalNewLinks if link not in self.toParse]
# remove URLs restricted by robots.txt
internalNewLinks = [link for link in internalNewLinks if self.repr.is_allowed(config.botName, link)]
# if config.debug: utils.log('links', internalNewLinks)
self.toParse.extend(internalNewLinks)
# EXTERNAL LINKS
# remove internal links
        externalNewLinks = [link for link in newLinks if not urlutils.isInternalLink(link, currentUrl)]
# remove already enqueued URLs
externalNewLinks = [link for link in externalNewLinks if link not in self.externalLinks]
self.externalLinks.extend(externalNewLinks)
self.parsedPages += 1
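        # The meta dict is passed to every filter module together with the page
        # source; filter results are folded back into it as they run (see runFilters).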
        meta = {
            'url': response.url,
            'response': response,
            'headers': response.headers,
            'newLinks': newLinks,
            'internalNewLinks': internalNewLinks,
            'domain': self.domain
        }
meta.update(self.initialMeta)
# RUN FIRST PAGE FILTERS
if self.parsedPages == 1:
result = self.runFilters(source, meta, self.firstPageFilters)
self.firstPageFiltersData = result
# RUN PAGE FILTERS
result = self.runFilters(source, meta, self.filters)
self.parsedUrls.append(result)
if self.parsedPages >= self.parseLimit:
self.save()
self.ended = True
if config.log:
logutils.log('end: hit parse limit')
return False
return True
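
    # A response is only parsed when it has a 2xx status code and, if a
    # content-type header is present, a text content type.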
def isValidResponse(self, response):
# some response filters
if not response.status_code:
return False
else:
if response.status_code < 200 or response.status_code > 299:
return False
# avoid non text/html resources
if 'content-type' in response.headers:
if 'text' not in response.headers['content-type']:
return False
return True
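
    # Run each filter module's filter(source, meta) hook, timing it for the
    # profiler; results accumulate into meta so later filters can see them, and
    # each output is stored under a 'filter_<name>' key.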
def runFilters(self, source, meta, filters):
result = {}
if config.runFilters:
for filter in filters:
meta.update(result)
                start = time.perf_counter()
                result['filter_' + filter['name']] = filter['mod'].filter(source, meta)
                profileutil.profile.save(filter['name'], time.perf_counter() - start)
result['url'] = meta['url']
return result
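
    # Serialize the crawl state and, when saving is enabled, upsert it into
    # ElasticSearch keyed by domain; external links are then flushed separately
    # via saveExternalUrls().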
def save(self):
keys = ['startUrl', 'firstPageFiltersData', 'toParse', 'parsedUrls', 'domain', 'parsedPages', 'lastRequest']
result = {}
for key in keys:
item = getattr(self, key)
if isinstance(item, deque):
item = list(item)
result[key] = item
        # Transform the list into an object keyed by index; ElasticSearch partial
        # updates need parsedUrls in this format.
        result['parsedUrls'] = {str(i): item for i, item in enumerate(result['parsedUrls'])}
if config.debug:
pprint(result)
# profileutil.profile.show()
# Save / update in ES
if config.save and len(result['parsedUrls']) > 0:
if self.updateMode:
updateData = {}
updateData['doc'] = result
updateData['doc_as_upsert'] = True
r = requests.post(config.elasticSearchEntity + '/' + result['domain'] + '/_update', data=json.dumps(updateData))
else:
r = requests.put(config.elasticSearchEntity + '/' + result['domain'], data=json.dumps(result))
bloom.add(result['domain'])
            if config.debug:
                print(r.content)
self.saveExternalUrls()
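
    # Reduce the collected external links to unique, www-stripped domains and push
    # the ones the bloom filter reports as new onto the external-domains queue.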
def saveExternalUrls(self):
if config.saveExternalUrls:
            externalDomains = [urlutils.getDomain(link) for link in self.externalLinks]
            if config.debug:
                print(externalDomains)
            externalDomains = [domain for domain in externalDomains if domain]
# remove www.
externalDomains = [domain if not domain.startswith('www.') else domain[4:] for domain in externalDomains]
externalDomains = listutils.unique(externalDomains)
for domain in externalDomains:
schemaLessDomain = urlutils.removeSchema(domain)
if bloom.add(domain):
item = {}
item['domain'] = schemaLessDomain
# pprint(item)
queueconnection.push(config.externalCollection, json.dumps(item))
    def update(self):
        # Not implemented yet.
        pass
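

# A minimal usage sketch, not part of the original module: it assumes the config
# module, the filter modules and the bloomd / queue backends are already set up,
# and that a driver simply pumps crawlNextUrl() until the site reports it has ended.
if __name__ == '__main__':
    # Hypothetical start URL, purely for illustration.
    site = Website('http://example.com/')
    while not site.ended:
        site.crawlNextUrl()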