codomator/randomforest1.py at master · datu925/codomator · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import numpy as np
import csv
import re

# This was my attempt to write a machine-learning classifier using scikit-learn.
# We ended up going with an organization called DrivenData to finish this task.
# You can find the competition history here: www.drivendata.org/competitions/4/


# Module-level whitelist filled by import_whitelist(): each key is the second
# column of a whitelist CSV row (the term or pattern) and each value is the
# first column (the rule; test_whitelist() acts on 'contains' and 'regex search').
whitelist = {}

def output_predictions(filename, testingdata, classf):
    """Write the classifier's predictions to a CSV file, one per row.

    Args:
        filename: Path of the CSV file to create (overwritten if it exists).
        testingdata: Feature data passed unchanged to ``classf.predict``.
        classf: Object exposing ``predict``; every item it yields becomes a
            single-column row.
    """
    import sys

    # The csv module needs a binary file on Python 2 but a text file opened
    # with newline='' on Python 3; a plain 'wb' raises TypeError on Python 3.
    if sys.version_info[0] < 3:
        csvfile = open(filename, 'wb')
    else:
        csvfile = open(filename, 'w', newline='')
    with csvfile:
        csv.writer(csvfile).writerows(
            [item] for item in classf.predict(testingdata))

def import_whitelist(filename, target=None):
    """Load whitelist rules from a CSV file into a dictionary.

    Every row is read as ``rule, term``: the second column becomes the key
    and the first column the value, so a later row for the same term
    overrides an earlier one.

    Args:
        filename: Path of the CSV file to read.
        target: Dictionary to fill. When omitted, the module-level
            ``whitelist`` is filled, which is what existing callers expect.

    Returns:
        The dictionary that was filled.

    Raises:
        IndexError: If a row has fewer than two columns.
    """
    import sys

    if target is None:
        target = whitelist

    # The csv module needs a binary file on Python 2 but a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] < 3:
        csvfile = open(filename, 'rb')
    else:
        csvfile = open(filename, 'r', newline='')
    with csvfile:
        for row in csv.reader(csvfile):
            target[row[1]] = row[0]
    return target

def test_whitelist(whitelist, word):
    if word in whitelist:
        return True
    for entry in whitelist:
        if whitelist[entry] == 'contains':
            if entry in word:
                return True
            else:
                continue
        elif whitelist[entry] == 'regex search':
            if re.search(entry, word) != None:
                return True
            else:
                continue
    return False


# Fill the module-level whitelist as soon as this module is executed; the
# relative path means 'whitelist.csv' is looked up in the current working directory.
import_whitelist('whitelist.csv')

def import_fin_files(filename, whitelist, target_column=40):
    """Extract whitelisted text, coding targets and operating status per row.

    The first line of the CSV is treated as the header. For every following
    row, the cells between column 1 and the 'Operating Status' column
    (exclusive) are lower-cased, split on whitespace, and the words accepted
    by ``test_whitelist`` are joined with single spaces.

    Args:
        filename: Path of the financial CSV file to read.
        whitelist: Whitelist mapping handed to ``test_whitelist``.
        target_column: Index of the column holding the coding target
            (defaults to 40, the index that was previously hard-coded).

    Returns:
        A tuple ``(fin_file, coding_targets, operating_list)`` of three lists
        with one entry per data row, in file order.

    Raises:
        ValueError: If the file is empty or its header has no
            'Operating Status' column.
        IndexError: If a data row is too short to hold the target or the
            operating-status column.
    """
    import sys

    # The csv module needs a binary file on Python 2 but a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] < 3:
        csvfile = open(filename, 'rb')
    else:
        csvfile = open(filename, 'r', newline='')
    with csvfile:
        reader = csv.reader(csvfile)
        headers = next(reader, None)
        if headers is None:
            raise ValueError("%r is empty; expected a header row" % (filename,))
        if 'Operating Status' not in headers:
            raise ValueError(
                "%r has no 'Operating Status' column" % (filename,))
        status_column = headers.index('Operating Status')

        fin_file = []
        operating_list = []
        coding_targets = []
        # The whitelist does not change during the call, so each distinct word
        # only needs to be run through the (linear, regex-heavy) check once.
        verdicts = {}
        for row in reader:
            coding_targets.append(row[target_column])
            operating_list.append(row[status_column])
            kept_words = []
            # Column 0 is skipped, exactly as the text columns end just before
            # the operating-status column.
            for cell in row[1:status_column]:
                for word in cell.lower().split():
                    if word not in verdicts:
                        verdicts[word] = test_whitelist(whitelist, word)
                    if verdicts[word]:
                        kept_words.append(word)
            fin_file.append(' '.join(kept_words))
    return fin_file, coding_targets, operating_list

#fin, targets, operating = import_fin_files('TrainingSetFull.csv',whitelist)

def save_file(filename, data):
    """Save ``data`` to a CSV file, one item per single-column row.

    Intended for caching slow-to-compute lists (such as the whitelisted text
    per row) so they can be reloaded quickly later.

    Args:
        filename: Path of the CSV file to create (overwritten if it exists).
        data: Iterable of items; each becomes one row.
    """
    import sys

    # The csv module needs a binary file on Python 2 but a text file opened
    # with newline='' on Python 3; a plain 'wb' raises TypeError on Python 3.
    if sys.version_info[0] < 3:
        csvfile = open(filename, 'wb')
    else:
        csvfile = open(filename, 'w', newline='')
    with csvfile:
        csv.writer(csvfile).writerows([item] for item in data)

#save_file('fin_file.csv',fin)
#save_file('op_file.csv',operating)
#save_file('targets.csv',targets)

def load_file(filename):
    """Load the first column of every row of a CSV file.

    Counterpart of ``save_file``: lets slow-to-compute data be reloaded from
    disk instead of being recomputed.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with the first cell of each row, in file order.

    Raises:
        IndexError: If the file contains a row with no cells (a blank line).
    """
    import sys

    # The csv module needs a binary file on Python 2 but a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] < 3:
        csvfile = open(filename, 'rb')
    else:
        csvfile = open(filename, 'r', newline='')
    with csvfile:
        return [row[0] for row in csv.reader(csvfile)]

# Reload the cached per-row lists instead of re-running the slow whitelist
# filtering; the three lists are indexed with the same row number below, so
# they are expected to be row-aligned.
fin = load_file('fin_file.csv')
targets = load_file('targets.csv')
operating = load_file('op_file.csv')

#Set parameters
# Rows [StartRow, StartTest) are candidates for training and rows
# [StartTest, EndTest) are the ones predicted.
# NOTE(review): StartRow = 1 leaves row 0 out of training -- confirm intended.
StartRow = 1
StartTest = 12000
EndTest = 13000

# Only rows labelled 'PreK-12 Operating' are used for training; the row filter
# is computed once and shared by the texts and their targets. The test slice
# is not filtered.
train_rows = [x for x in range(StartRow, StartTest)
              if operating[x] == 'PreK-12 Operating']
train_file = [fin[x] for x in train_rows]
train_targets = [targets[x] for x in train_rows]
test_file = [fin[x] for x in range(StartTest, EndTest)]


#from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# NOTE(review): 'charset_error' is the keyword name used by old scikit-learn
# releases; confirm against the installed version before upgrading.
vectorizer = CountVectorizer(min_df=1,charset_error='ignore')

# Bag-of-words counts for the training texts, converted from sparse to dense.
X_train = vectorizer.fit_transform(train_file)
X_train = X_train.todense()

#from sklearn.naive_bayes import MultinomialNB
#from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
#clf = MultinomialNB()
#clf = SGDClassifier(loss="hinge", alpha=0.01, n_iter=200, fit_intercept=True)
# NOTE(review): 'compute_importances' is likewise an old scikit-learn keyword;
# confirm against the installed version before upgrading.
clf = RandomForestClassifier(n_estimators = 500, compute_importances=True)
#MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
# Only the fitted model is needed: the reduced matrix that fit_transform()
# returned was never used, so plain fit() does the same job without it.
clf.fit(X_train, train_targets)

# Reuse the vocabulary learned from the training texts for the test rows.
testdata = vectorizer.transform(test_file)
testdata = testdata.todense()


output_predictions('predictions.csv',testdata, clf)