-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrandomforest1.py
More file actions
124 lines (101 loc) · 4.09 KB
/
Copy pathrandomforest1.py
File metadata and controls
124 lines (101 loc) · 4.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import csv
import re
import sys

import numpy as np
#this was my attempt to write a machine learning algorithm using scikit-learn
#we ended up going with an organization called DrivenData to finish this task
#you can find competition history here: www.drivendata.org/competitions/4/
# Module-level whitelist filled by import_whitelist(): keys come from the
# second CSV column and values from the first. test_whitelist() gives special
# meaning to the values 'contains' and 'regex search'.
whitelist = {}
def output_predictions(filename, testingdata, classf):
    """Write the classifier's predictions for ``testingdata`` to a CSV file.

    Args:
        filename: Path of the CSV file to create; an existing file is
            overwritten.
        testingdata: Feature data passed unchanged to ``classf.predict``.
        classf: Fitted classifier exposing a ``predict`` method.

    The file receives one single-column row per prediction, in the order
    ``predict`` returns them.
    """
    # Predict before touching the file so a failing classifier does not leave
    # a truncated, empty output file behind.
    predictions = classf.predict(testingdata)

    # The csv module needs a text-mode file opened with newline='' on
    # Python 3, but a binary-mode file on Python 2.
    if sys.version_info[0] >= 3:
        csvfile = open(filename, 'w', newline='')
    else:
        csvfile = open(filename, 'wb')
    with csvfile:
        writer = csv.writer(csvfile)
        for item in predictions:
            writer.writerow([item])
def import_whitelist(filename):
    """Load whitelist rules from a CSV file into the module-level ``whitelist``.

    For every row the second column becomes the dictionary key and the first
    column its value. ``test_whitelist`` interprets the values 'contains' and
    'regex search' as matching rules for the key. A key that appears more
    than once keeps the value of its last row.

    Args:
        filename: Path of the whitelist CSV file.
    """
    # The csv module needs a text-mode file opened with newline='' on
    # Python 3, but a binary-mode file on Python 2.
    if sys.version_info[0] >= 3:
        csvfile = open(filename, 'r', newline='')
    else:
        csvfile = open(filename, 'rb')
    with csvfile:
        for row in csv.reader(csvfile):
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing
                # newline); there is nothing to register for them.
                continue
            whitelist[row[1]] = row[0]
def test_whitelist(whitelist, word):
    """Return True when ``word`` is accepted by ``whitelist``.

    A word is accepted when any of the following holds:

    * it is itself a key of ``whitelist``;
    * some key whose value is 'contains' is a substring of the word;
    * some key whose value is 'regex search' matches the word via
      ``re.search``.

    Args:
        whitelist: Mapping of word/pattern -> rule type.
        word: The word to check.

    Returns:
        True if the word is whitelisted, otherwise False.
    """
    if word in whitelist:
        return True
    for entry, rule in whitelist.items():
        if rule == 'contains':
            if entry in word:
                return True
        elif rule == 'regex search':
            if re.search(entry, word) is not None:
                return True
    return False


# Despite its name this is a helper, not a test case: opt out of pytest/nose
# collection so that importing it into a test module does not turn it into a
# (failing) test.
test_whitelist.__test__ = False
# Populate the module-level whitelist as soon as the script runs; the path is
# relative, so it is resolved against the current working directory.
import_whitelist('whitelist.csv')
def import_fin_files(filename, whitelist, target_column=40):
    """Extract whitelisted words and label columns from a financial CSV file.

    The first row of the file is treated as the header. For every following
    row, the cells from column 1 up to (but not including) the
    'Operating Status' column are lower-cased and split on whitespace; the
    words accepted by ``test_whitelist`` are joined with single spaces.

    Args:
        filename: Path of the CSV file to read.
        whitelist: Whitelist mapping handed to ``test_whitelist``.
        target_column: Zero-based index of the column holding the coding
            target. Defaults to 40, the index this function used to
            hard-code.

    Returns:
        A tuple ``(fin_file, coding_targets, operating_list)`` of three
        lists with one entry per data row: the space-joined whitelisted
        words, the value of ``target_column``, and the value of the
        'Operating Status' column.

    Raises:
        ValueError: If the file is empty or has no 'Operating Status' header.
        IndexError: If a data row is too short to contain ``target_column``
            or the 'Operating Status' column.
    """
    # The csv module needs a text-mode file opened with newline='' on
    # Python 3, but a binary-mode file on Python 2.
    if sys.version_info[0] >= 3:
        csvfile = open(filename, 'r', newline='')
    else:
        csvfile = open(filename, 'rb')
    with csvfile:
        reader = csv.reader(csvfile)
        headers = next(reader, None)
        if headers is None:
            raise ValueError('%s is empty' % filename)
        if 'Operating Status' not in headers:
            raise ValueError("no 'Operating Status' column in %s" % filename)
        coding_count = headers.index('Operating Status')

        fin_file = []
        operating_list = []
        coding_targets = []
        # The same words recur on many rows; remember each verdict so the
        # whitelist (including its regexes) is scanned once per distinct word.
        verdicts = {}
        for row in reader:
            coding_targets.append(row[target_column])
            operating_list.append(row[coding_count])
            kept_words = []
            for cell in row[1:coding_count]:
                for word in cell.lower().split():
                    if word not in verdicts:
                        verdicts[word] = test_whitelist(whitelist, word)
                    if verdicts[word]:
                        kept_words.append(word)
            fin_file.append(' '.join(kept_words))
    return fin_file, coding_targets, operating_list
#fin, targets, operating = import_fin_files('TrainingSetFull.csv',whitelist)
def save_file(filename, data):
    """Save ``data`` to a CSV file, one item per single-column row.

    Intended for caching expensive intermediate results (such as the
    ``fin_file`` list) so they can be reloaded with ``load_file``.

    Args:
        filename: Path of the CSV file to create; an existing file is
            overwritten.
        data: Iterable of items; each one is written as its own row.
    """
    # The csv module needs a text-mode file opened with newline='' on
    # Python 3, but a binary-mode file on Python 2.
    if sys.version_info[0] >= 3:
        csvfile = open(filename, 'w', newline='')
    else:
        csvfile = open(filename, 'wb')
    with csvfile:
        writer = csv.writer(csvfile)
        for item in data:
            writer.writerow([item])
#save_file('fin_file.csv',fin)
#save_file('op_file.csv',operating)
#save_file('targets.csv',targets)
def load_file(filename):
    """Load the first column of a CSV file into a list.

    Counterpart of ``save_file``: use it to reload cached results instead of
    repeating slow preprocessing.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with the first field of every row, in file order.

    Raises:
        IndexError: If the file contains a row without any field (for
            example a blank line). Such rows are deliberately not skipped,
            because dropping them would shift the row positions that the
            rest of the script uses to line up several loaded files.
    """
    # The csv module needs a text-mode file opened with newline='' on
    # Python 3, but a binary-mode file on Python 2.
    if sys.version_info[0] >= 3:
        csvfile = open(filename, 'r', newline='')
    else:
        csvfile = open(filename, 'rb')
    with csvfile:
        return [row[0] for row in csv.reader(csvfile)]
# Reload the preprocessed text, target labels and operating-status values from
# their cache files (presumably written earlier by the commented-out
# save_file calls above -- confirm before regenerating them).
fin = load_file('fin_file.csv')
targets = load_file('targets.csv')
operating = load_file('op_file.csv')
#Set parameters
# Rows in [StartRow, StartTest) are candidates for training and rows in
# [StartTest, EndTest) are the ones predictions are written for.
# NOTE(review): StartRow = 1 leaves row 0 of the cached files out of training
# -- confirm this is intended and not a leftover header-skip.
# NOTE(review): xrange below exists only on Python 2.
StartRow = 1
StartTest = 12000
EndTest = 13000
# Train only on rows whose operating status is 'PreK-12 Operating'; the same
# filter is applied to both lists so texts and targets stay aligned.
train_file = [fin[x] for x in xrange(StartRow, StartTest) if operating[x] == 'PreK-12 Operating']
train_targets = [targets[x] for x in xrange(StartRow, StartTest) if operating[x] == 'PreK-12 Operating']
# The prediction rows are NOT filtered by operating status.
test_file = [fin[x] for x in xrange(StartTest, EndTest)]
#from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# Turn each training string into a token-count vector.
# NOTE(review): charset_error looks like a keyword from an old scikit-learn
# release -- verify against the installed version.
vectorizer = CountVectorizer(min_df=1,charset_error='ignore')
X_train = vectorizer.fit_transform(train_file)
# Convert the vectorizer output to a dense matrix before fitting.
X_train = X_train.todense()
#from sklearn.naive_bayes import MultinomialNB
#from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
#clf = MultinomialNB()
#clf = SGDClassifier(loss="hinge", alpha=0.01, n_iter=200, fit_intercept=True)
# NOTE(review): compute_importances also looks like an old scikit-learn
# keyword -- verify against the installed version.
clf = RandomForestClassifier(n_estimators = 500, compute_importances=True)
#MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
# The return value of fit_transform is discarded; only the fitted state of
# clf is used afterwards.
# NOTE(review): presumably a plain fit() would suffice here -- confirm.
clf.fit_transform(X_train, train_targets)
# Vectorize the prediction rows with the vectorizer fitted on the training
# rows, then write one prediction per row to predictions.csv.
testdata = vectorizer.transform(test_file)
testdata = testdata.todense()
output_predictions('predictions.csv',testdata, clf)