-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsearch_engine.py
More file actions
160 lines (125 loc) · 3.65 KB
/
search_engine.py
File metadata and controls
160 lines (125 loc) · 3.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
def get_page(url):
try:
import urllib3
http = urllib3.PoolManager()
r = http.request('GET', url)
return r.data
except:
return ""
def get_next_target(page):
start_link = page.find('<a href=')
if start_link == -1:
return None, 0
start_quote = page.find('"', start_link)
end_quote = page.find('"', start_quote +1)
url = page[start_quote + 1:end_quote]
return url, end_quote
def get_all_links(page):
links = []
while True:
url, endpos = get_next_target(page)
if url:
links.append(url)
page = page[endpos:]
else:
break
return links
#links = get_all_links(get_page("http://www.udacity.com/cs101x/index.html"))
def union(a,b):
for bb in b:
if bb not in a:
a.append(bb)
#return a
def crawl_web(seed,max_depth):
tocrawl = [seed]
crawled = []
next_depth =[]
depth = 0
index = []
while tocrawl and depth <= max_depth:
page = tocrawl.pop()
if page not in crawled:
content = get_page(page)
add_page_to_index(index,page,content)
union(next_depth, get_all_links(content))
crawled.append(page)
if not tocrawl:
tocrawl, next_depth = next_depth, []
depth = depth + 1
return index
#print crawl_web('http://www.udacity.com/cs101x/index.html',100)
#print crawl_web('http://news.yahoo.com/',4)
# index will be a structure like
# [ [<keyword>, [<url>,count][<url>,count][...]], ...]
def add_to_index(index,keyword,url):
for entry in index:
if entry[0] == keyword:
for element in entry[1]:
if element == url:
return
entry[1].append([url,0])
return
index.append([keyword,[url,0]])
def lookup(index,keyword):
for entry in index:
if entry[0] == keyword:
return entry[1]
return []
def add_page_to_index(index,url,content):
words = content.split()
for word in words:
add_to_index(index,word,url)
def split_string(source,splitlist):
output = []
atsplit = True
for char in source:
if char in splitlist:
atsplit = True
else:
if atsplit:
output.append(char)
atsplit = False
else:
output[-1] = output[-1] + char
return output
def record_user_click(index, keyword, url):
urls = lookup(index, keyword)
if urls:
for entry in urls:
if entry[0] == url:
entry[1] = entry[1]+1
def hash_string(keyword, buckets):
h = 0
for c in keyword:
h = (h + ord(c)) % buckets
return h
#print hash_string("ua",12)
#print hash_string("udacity",12)
def make_hashtable(nbuckets):
table = []
for unused in range(0,nbuckets):
table.append([])
return table
def hashtable_get_bucket(htable,key):
return htable[hash_string(key, len(htable))]
def hashtable_add(htable, key, value):
hashtable_get_bucket(htable, key).append([key,value])
def hashtable_lookup(htable,key):
bucket = hashtable_get_bucket(htable, key)
for entry in bucket:
if entry[0] == key:
return entry[1]
return None
def hashtable_update(htable,key, value):
bucket = hashtable_get_bucket(htable, key)
for entry in bucket:
if entry[0] == key:
entry[1] = value
return
bucket.append([key,value])
#print make_hashtable(3)
index=[]
add_page_to_index(index,'fake.test',"This is a test")
#print index
add_page_to_index(index,'not.test',"This is not a test")
#print index