-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathAlgorithmEngine.py
More file actions
240 lines (190 loc) · 7.83 KB
/
AlgorithmEngine.py
File metadata and controls
240 lines (190 loc) · 7.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
print("(Running imports...)")
import math
import csv
import numpy
import sklearn
from sklearn.naive_bayes import GaussianNB
from sklearn import linear_model
from sklearn import metrics
import joblib #THIS COULD BE VERY INSECURE, CHECK WITH SOMEONE WHO ACTUALLY KNOWS WHAT THEY'RE TALKING ABOUT!
#import metrics from sklearn
print("(Finished imports.)")
class Sharable():
    """Bundle a trained algorithm with the metadata needed to share it.

    Instances are persisted with joblib, either as a local state file
    (``<name>_<version>.saf``) or as a more heavily compressed export
    (``<name>_<version>.paf``).

    SECURITY NOTE: joblib files are pickle based, so loading one executes
    whatever code the file describes. Only load files from trusted sources.
    """

    # member variables (class-level defaults, overridden per instance in __init__)
    algorithm = None  # the trained algorithm object
    name = ""
    version = ""
    statistics = None  # this will probably be an sklearn helper class. We might want to wrap it up with other stuff also
    computation = None  # we might have to implement this in numpy
    client_history = None  # client-side provenance
    usage = None  # input and output types, along with usage intentions

    def __init__(self, algorithm, name, version, statistics, computation, client_history, usage):
        """Store every constructor argument on the instance."""
        self.algorithm = algorithm
        self.name = name
        self.version = version
        self.statistics = statistics
        # Previously dropped on the floor, leaving the class default (None).
        self.computation = computation
        self.client_history = client_history
        self.usage = usage

    def _archivePath(self, path, extension):
        """Return ``path + name + "_" + version + extension``.

        ``path`` is used as a plain string prefix, so it must already end
        with a path separator if it names a directory.
        """
        return path + self.name + "_" + self.version + extension

    def saveState(self, path):
        """Dump this object to ``<path><name>_<version>.saf`` (compress=2)."""
        # cache_size is no longer passed: joblib documents it as deprecated
        # and without effect.
        joblib.dump(self, self._archivePath(path, ".saf"), compress=2, protocol=None)

    @staticmethod
    def loadState(file):  # verify upstream that this is a .saf, not a .paf
        """Load and return the object stored in ``file``.

        Declared static because it never needed an instance; both
        ``Sharable.loadState(f)`` and ``instance.loadState(f)`` work.
        """
        # WARNING: unpickles arbitrary objects; never call on untrusted files.
        return joblib.load(file)

    def exportPAF(self, path):
        """Dump this object to ``<path><name>_<version>.paf`` (compress=9)."""
        # TODO: some sort of hashing thing to allow the ability to verify integrity
        joblib.dump(self, self._archivePath(path, ".paf"), compress=9, protocol=None)

    def importPAF(self, path):
        """Load and return the object stored in the file at ``path``."""
        # TODO: some sort of hash verification to ensure this is the expected thing
        # WARNING: unpickles arbitrary objects; never call on untrusted files.
        return joblib.load(path)
class SupervisedClassifier():
    """Thin wrapper that picks an sklearn estimator by name or ID code.

    Recognised identifiers:
        "naive_bayes"      / "00-00" -> GaussianNB
        "feed_forward"     / "00-01" -> MLPClassifier
        "LinearRegression" / "00-02" -> linear_model.LinearRegression

    Any other identifier leaves ``A`` as None and ``ID`` as "FF-00"
    (no algorithm set); ``predict``/``fit`` will then fail on ``None``.
    """

    # member variables
    A = None  # root algorithm
    ID = "FF-00"  # no algorithm set

    def __init__(self, id):
        """Instantiate the estimator selected by ``id`` (name or ID code)."""
        if id in ("naive_bayes", "00-00"):
            self.A = GaussianNB()
            self.ID = "00-00"
        elif id in ("feed_forward", "00-01"):
            # Imported here because the file's import block never brought
            # MLPClassifier into scope.
            from sklearn.neural_network import MLPClassifier
            # gonna need a way to pass in arguments at some point
            # 'solver' replaces the pre-release 'algorithm' keyword, and the
            # L-BFGS option is spelled 'lbfgs'.
            self.A = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
            self.ID = "00-01"
        elif id in ("LinearRegression", "00-02"):
            self.A = linear_model.LinearRegression()
            self.ID = "00-02"

    def predict(self, set):
        """Return the wrapped estimator's predictions for ``set``."""
        return self.A.predict(set)

    def fit(self, set, exp):
        """Fit the wrapped estimator on inputs ``set`` and targets ``exp``.

        Returns whatever the estimator's own ``fit`` returns, i.e. the
        underlying estimator rather than this wrapper.
        """
        return self.A.fit(set, exp)

    def metafit(self, I, set, exp, ratio=0.1):
        """Fit this algorithm on the predictions that ``I`` makes for ``set``.

        ``I`` is any object exposing ``predict(set)``. ``ratio`` is accepted
        for backward compatibility but is currently unused, because ``fit``
        has no hold-out support.
        """
        # The old body called a non-existent global ``fit`` and passed ``I``
        # to its own bound ``predict`` a second time.
        return self.fit(I.predict(set), exp)
class Datatype():
    """Descriptor for the kind of values a dataset holds.

    Per the original design note, a datatype can be Null, Boolean, Integer,
    BoundedReal, Real, Complex, String, Image, or Aggregate.
    """

    # Human-readable label; "unnamed" until a caller assigns one.
    name = "unnamed"
    # -1 marks "not set". NOTE(review): the test script in this file stores a
    # column count here; confirm that is the intended meaning of rank.
    rank = -1
    # private Datatype[] bundle = null;
class Dataset():
    """Table loaded from delimited text, plus a per-column normalized copy.

    ``rawData`` holds the file contents as loaded (header row removed),
    ``categories`` holds the header names, and ``normalData`` holds a float
    matrix in which every column has been rescaled with (value - mean) / std.
    """

    # member variables (class-level defaults; the load/normalize methods rebind them per instance)
    rawData = None  # matrix loaded directly from a dataset; the header row is stripped after loading
    normalData = None  # float matrix, each column rescaled to zero mean / unit standard deviation
    datatype = None  # datatype of the set TODO make this work
    dataRows = 0
    dataCols = 0
    categories = []  # keep track of the name of the categories
    # NOTE: based on current method, not using hard limits of -1 and 1, rather
    # using -mean / std method, which seems to be fairly conventional

    def _table(self, normalized):
        """Return the normalized matrix if ``normalized`` else the raw one."""
        return self.getNormalData() if normalized else self.getRawData()

    def _columnIndex(self, name):
        """Return the position of category ``name``, or None if absent."""
        for position, category in enumerate(self.categories):
            if name == category:
                return position
        return None

    @staticmethod
    def _cleanEntry(entry):
        """Return ``entry`` as text with double quotes and outer blanks removed."""
        # Depending on the numpy version, genfromtxt yields bytes or str.
        if isinstance(entry, bytes):
            entry = entry.decode("utf-8", "replace")
        return str(entry).replace('"', '').strip()  # TODO: escaped quotes are allowed!

    def indexColumn(self, dex, normalized=False):
        """Return column number ``dex`` from the raw (or normalized) matrix."""
        # The old body subscripted the bound method itself (TypeError) and
        # ignored ``normalized``.
        return self._table(normalized)[:, dex]

    def getColumn(self, name, normalized=False):
        """Return the column called ``name``, or None if no such category."""
        position = self._columnIndex(name)
        if position is None:
            return None
        return self._table(normalized)[:, position]

    def excludeColumn(self, name, normalized=False):
        """Return a copy of the matrix without the column called ``name``.

        When ``name`` is not a known category there is nothing to exclude,
        so an unmodified copy of the matrix is returned.
        """
        table = self._table(normalized)
        position = self._columnIndex(name)
        if position is None:
            return numpy.array(table, copy=True)
        # The old loop remembered the last category that did NOT match and
        # deleted that one, leaving the named column in the result.
        return numpy.delete(table, position, axis=1)

    def loadFromText(self, fileName, delim):
        """Load ``fileName`` (fields separated by ``delim``) into this object.

        Fills ``rawData``, ``categories``, ``dataRows`` and ``dataCols`` and
        prints a short summary.
        """
        print("(Loading data...)")
        # The header row is deliberately read as data and removed afterwards:
        # it forces every column to a string type so genfromtxt returns a
        # plain 2-D array (skip_header left ``shape`` unusable).
        self.rawData = numpy.genfromtxt(fileName, dtype=None, delimiter=delim)
        # get categories; next() works on Python 2 and 3, and the context
        # manager closes the handle.
        with open(fileName) as handle:
            self.categories = next(csv.reader(handle, delimiter=delim, quotechar='"'))
        # remove header
        self.rawData = numpy.delete(self.rawData, 0, 0)
        # Get dimensions
        self.dataRows = self.rawData.shape[0]
        self.dataCols = self.rawData.shape[1]
        print("(Data loaded!)")
        print("DATA SIZE: " + str(self.dataRows) + " rows " + str(self.dataCols) + " cols")
        print("Categories:")
        for cat in self.categories:
            print("\t" + cat)

    def normalizeData(self):
        """Normalize every column of ``rawData`` into ``normalData``."""
        print("(Normalizing...)")
        # Allocate floats explicitly: empty_like(rawData) would inherit the
        # fixed-width string dtype and silently truncate the results.
        self.normalData = numpy.empty(self.rawData.shape, dtype=float)
        for c in range(self.rawData.shape[1]):
            self.normalData[:, c] = self.normalizeVar(self.rawData[:, c])
        print("(Normalizing complete!)")

    # normalizes col of data (one input variable)
    # TODO: string normalization is using a very unofficial method and should be worked on further!
    def normalizeVar(self, rawCol):
        """Return ``rawCol`` as floats rescaled to (value - mean) / std.

        The first entry decides the column type. Numeric columns are parsed
        with ``float``; a later non-numeric entry raises ValueError. Any
        other column is encoded as the sum of each string's character codes.
        An empty column yields an empty array, and a constant column yields
        all zeros instead of dividing by a zero standard deviation.
        """
        cleaned = [self._cleanEntry(entry) for entry in rawCol]
        if not cleaned:
            return numpy.empty(0, dtype=float)
        # check if num
        sampleEntry = cleaned[0]
        print("checking col type with sample " + str(sampleEntry))  # DEBUG
        if self.isNumber(sampleEntry):
            values = numpy.array([float(text) for text in cleaned], dtype=float)
        else:
            print("--NORMALIZING STRING COLUMN--")  # DEBUG
            # right now, just add up all character codes of each string; kept
            # in a float array so the sums are never squeezed back into a
            # fixed-width string slot.
            values = numpy.array(
                [float(sum(ord(ch) for ch in text)) for text in cleaned],
                dtype=float,
            )
        centered = values - values.mean()
        sDev = values.std()
        if sDev == 0:
            return centered
        return centered / sDev

    def getRawData(self):
        """Return the raw matrix."""
        return self.rawData

    def getNormalData(self):
        """Return the normalized matrix (None until normalizeData runs)."""
        return self.normalData

    def isNumber(self, num):  # http://stackoverflow.com/questions/354038/how-do-i-check-if-a-string-is-a-number-float-in-python
        """Return True when ``float(num)`` succeeds, False otherwise."""
        try:
            float(num)
        except (TypeError, ValueError):
            return False
        return True
# ---------------------------------------------------------------------------
# Engine smoke test: train a linear regression that predicts one column of
# the student-mat data from all the others, then score it on student-por.
# NOTE(review): this runs at import time; it is left at module level so
# existing import behaviour is unchanged. Consider an
# `if __name__ == "__main__":` guard.
# ---------------------------------------------------------------------------
import os  # local to the smoke test: builds the data paths portably

print("Starting Engine Test")
sampleCol = "freetime"  # the column the model is asked to predict
# os.path.join gives ".\TestData\..." on Windows (same as before) and a valid
# path elsewhere.
_dataDir = os.path.join(".", "TestData")

inputs = Dataset()
inputs.loadFromText(os.path.join(_dataDir, "student-mat.csv"), ";")
inputs.normalizeData()
infotype = Datatype()
infotype.name = "Student Data"
infotype.rank = inputs.dataCols
inputs.datatype = infotype

testin = Dataset()
testin.loadFromText(os.path.join(_dataDir, "student-por.csv"), ";")
testin.normalizeData()
testin.datatype = infotype
print("Data successfully loaded\nCreating Algorithm")

algorithm = SupervisedClassifier("00-02")
print("Algorithm successfully built")

# dtype=float replaces the "float_" alias, which NumPy 2.0 removed.
S = numpy.asarray(inputs.getColumn(sampleCol, True), dtype=float)
T = numpy.asarray(inputs.excludeColumn(sampleCol, True), dtype=float)
print("Column " + str(T))

print("Training Algorithm")
# fit() returns the underlying sklearn estimator, so from here on `algorithm`
# is that estimator (which is what provides .score), not the wrapper.
algorithm = algorithm.fit(T, S)
print("Algorithm trained")
print(algorithm.score(T, S))

print("Testing extends")
S1 = numpy.asarray(testin.getColumn(sampleCol, True), dtype=float)
T1 = numpy.asarray(testin.excludeColumn(sampleCol, True), dtype=float)
print(algorithm.score(T1, S1))