-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathscrape.py
More file actions
54 lines (47 loc) · 1.36 KB
/
scrape.py
File metadata and controls
54 lines (47 loc) · 1.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
from stackapi import StackAPI
import sys
import pymongo
# Fetch highly voted Stack Overflow questions for the tag(s) given on the
# command line, store the raw API items in the local MongoDB collection
# `stackapi.questions`, and write one CSV row per unique question that carries
# every requested tag and has an accepted answer.
#
# Usage: python scrape.py "<tag> [<tag> ...]"
# All tags are passed as ONE whitespace-separated argument.
SITE = StackAPI('stackoverflow')

if len(sys.argv) < 2:
    # Fail with a readable message instead of a bare IndexError.
    sys.exit('usage: scrape.py "<tag> [<tag> ...]"')
tag = sys.argv[1]
tags = tag.split()

# The tags joined with "_" label every CSV row and name the output file.
tag_name = "_".join(tags)
filename = tag_name + ".csv"

c = 0  # number of CSV rows written
fetched_questions = set()  # question ids already seen, to dedupe across tags

client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client.stackapi
collec_ques = db.questions

# The context manager guarantees the file is closed even if a fetch or a
# database call raises part-way through. The "extracted_questions" directory
# must already exist.
with open("extracted_questions/" + filename, "w+") as f:
    print("Fetching questions...")
    f.write("question_id, tag, link, tags, accepted_answer\n")
    for tag in tags:
        # fromdate/todate are Unix timestamps (about 2009-11-02 to 2016-03-06
        # UTC). NOTE(review): with sort='votes', min=20 presumably means a
        # minimum score of 20 -- confirm against the Stack Exchange API docs.
        questions = SITE.fetch(
            'questions',
            fromdate=1257136000,
            todate=1457222400,
            min=20,
            tagged=tag,
            sort='votes',
            order='desc',
        )
        items = questions['items']
        # insert_many() rejects an empty document list, so skip the insert
        # when a tag yields no results.
        if items:
            collec_ques.insert_many(items)
        for q in items:
            # Keep only questions that carry EVERY requested tag. The previous
            # flag loop was overwritten on each iteration, so only the last
            # tag was actually checked.
            if not all(t in q['tags'] for t in tags):
                continue
            question_id = q['question_id']
            if question_id in fetched_questions:
                continue
            fetched_questions.add(question_id)
            # Questions without an accepted answer are skipped (but stay
            # marked as seen, as before).
            try:
                accepted_answer_id = str(q['accepted_answer_id'])
            except KeyError:
                continue
            answer_url = q['link'] + '/' + accepted_answer_id + '#' + accepted_answer_id
            # Row layout matches the header; the question's own tags are
            # ';'-separated inside a single column.
            row = (str(question_id) + "," + tag_name + ", " + q['link'] + ", "
                   + ";".join(q['tags']) + ',')
            f.write(row + answer_url + '\n')
            c += 1
    print("Extracted " + str(c) + " questions and stored in " + filename)